# SCRIPT TO BUILD DECISION TREE MODEL
Generates a model using the entire available dataset.<br>
Uses the best hyperparameter configuration reported in the article (max_depth=4; min_samples_leaf=7).<br>
A plot file (PNG) is created for each tree.<br>
A rules file (Excel) is created for each tree.

In [None]:
#!/usr/bin/env python
# coding: utf-8

from IPython.display import display, HTML
# Widen the notebook container so wide tables and plots use the full page width
full_width_style = HTML("<style>.container { width:100% !important; }</style>")
display(full_width_style)

In [None]:
# Python version and rest of packages needed

import sys
# print('Python: {}'.format(sys.version))
import scipy
# print('scipy: {}'.format(scipy.__version__))
import numpy as np
# print('numpy: {}'.format(np.__version__))
import matplotlib as mat
# print('matplotlib: {}'.format(mat.__version__))
import pandas as pd
# print('pandas: {}'.format(pd.__version__))
import sklearn as sk
# print('sklearn: {}'.format(sk.__version__))
import pyreadstat
# print('pyreadstat: {}'.format(pyreadstat.__version__))
import imblearn as im
# print('imblearn: {}'.format(im.__version__))
import joblib
# print('joblib: {}'.format(joblib.__version__))
import graphviz
# print('graphviz: {}'.format(graphviz.__version__))

In [None]:
# Import functions from packages

from collections import Counter
from numpy import mean
from numpy import std
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text, export_graphviz
from sklearn.tree import _tree
from sklearn import preprocessing
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [None]:
# Read the comma-separated dataset into a DataFrame and show it
# (low_memory=False makes pandas parse the file in a single pass)
dataset_path = '../../INPUT_dataset/BDsocioeconomic_dummy.csv'
df = pd.read_csv(dataset_path, delimiter=',', low_memory=False)
df

In [None]:
# Enumerate the column labels of the loaded dataset.
# This is the last expression of the cell, so its value (the columns Index)
# is what the cell evaluates to.

df.columns

In [None]:
# Summary of how imbalanced the target class is: share of each class (in %)
# and absolute number of instances per class.
target_series = df['Class_suicidal_behaviour']
total_instances = df.shape[0]

suicidal_behaviour_rate = df.loc[target_series == 1].shape[0] / total_instances * 100
non_suicidal_behaviour_rate = df.loc[target_series == 0].shape[0] / total_instances * 100
n_non_yes_suicidal_behaviour = Counter(target_series)

print(f'The rate of non suicidal behaviour (0) is: {non_suicidal_behaviour_rate:.2f}%')
print(f'The rate of suicidal behaviour (1) is: {suicidal_behaviour_rate:.2f}%')
print('Number of instances of each class:')
print(n_non_yes_suicidal_behaviour)

In [None]:
# Visualise one instance (the first row) to see what an observation looks like.
# df.iloc[0] is a Series, and Series.drop() ignores the `columns=` keyword, so the
# class label has to be removed through `labels=` (the Series index) instead;
# otherwise the target would still be shown as part of the instance.
first_instance = df.iloc[0].drop(labels=['Class_suicidal_behaviour'])
first_instance

## Generation of a model using the entire available dataset

In [None]:
# Name of the dependent variable (the class to predict)
target_column = 'Class_suicidal_behaviour'

# Work on a copy so the DataFrame loaded from disk is left untouched
dataset = df.copy()

# Independent variables: every column except the class.
# df_values keeps the original observations; dataset_values is the working copy
# that later steps overwrite.
df_values = dataset.drop(columns=[target_column])
dataset_values = df_values.copy()

# Dependent variable: a one-column DataFrame holding the class.
# df_targets keeps the original labels; dataset_targets is the working copy.
df_targets = pd.DataFrame(dataset[target_column], columns=[target_column])
dataset_targets = df_targets.copy()

In [None]:
# Relative importance of the classes.
# Missing a suicidal behaviour is the costlier mistake
# (a False Negative is worse than a False Positive).
class_weight = {0: 0.2, 1: 0.8}

# seed that controls the randomized algorithms
seed = 1

# Hyperparameters shared by both decision tree learners
# (max_leaf_nodes and min_samples_split are left at their defaults)
tree_settings = dict(max_depth=4, min_samples_leaf=7,
                     class_weight=class_weight, random_state=seed)

# model that keeps the statistics of the rebalanced data
rebalanced_model = DecisionTreeClassifier(**tree_settings)
# second, identically configured model: later its node statistics are
# recomputed with the original (not rebalanced) data
original_data_model = DecisionTreeClassifier(**tree_settings)

# verbosity of this cell (0=none, 1=low, 2=high)
debug_level = 0

# class distribution before rebalancing
if debug_level >= 1:
    positive_rows = dataset_targets.loc[dataset_targets['Class_suicidal_behaviour'] == 1]
    suicidal_behaviour_rate = positive_rows.shape[0] / dataset_targets.shape[0] * 100
    n_non_yes_suicidal_behaviour = Counter(dataset_targets['Class_suicidal_behaviour'])
    print(f'The rate of suicidal behaviour (1) BEFORE rebalancing is: {suicidal_behaviour_rate:.2f}%')
    print('Number of instances of each class')
    print(n_non_yes_suicidal_behaviour)

# rebalancing works better when every feature is scaled between 0 and 1
this_scaler = preprocessing.MinMaxScaler().fit(dataset_values)
dataset_values = this_scaler.transform(dataset_values)

# rebalance the dataset: oversample first, then undersample
oversampling_conf = SMOTE(sampling_strategy=0.1, random_state=seed)
undersamplinf_conf = RandomUnderSampler(sampling_strategy=0.2, random_state=seed)
rebalance_steps = [('o', oversampling_conf), ('u', undersamplinf_conf)]
rebalance_pipeline = Pipeline(steps=rebalance_steps)
dataset_values, dataset_targets = rebalance_pipeline.fit_resample(dataset_values, dataset_targets)

# The decision tree does not need normalised data,
# so the features are mapped back to their original ranges
dataset_values = this_scaler.inverse_transform(dataset_values)

# When the data is NOT normalised, rounding the columns that originally held
# INTEGER values can help produce a better model.
integer_columns = (
    0,   # Sex_M0_F1
    2,   # Day_in_week
    3,   # Day_in_month
    4,   # Month
    5,   # Quarter
    6,   # Week_in_year
    7,   # Week_in_month
    8,   # Working_day
    9,   # Day1_Night2
    17,  # Num_requests_last_months
)
for column in integer_columns:
    dataset_values[:, column] = list(map(round, dataset_values[:, column]))

# class distribution after rebalancing
if debug_level >= 1:
    positive_rows = dataset_targets.loc[dataset_targets['Class_suicidal_behaviour'] == 1]
    suicidal_behaviour_rate = positive_rows.shape[0] / dataset_targets.shape[0] * 100
    n_non_yes_suicidal_behaviour = Counter(dataset_targets['Class_suicidal_behaviour'])
    print(f'The rate of suicidal behaviour (1) AFTER rebalancing is: {suicidal_behaviour_rate:.2f}%')
    print('Number of instances of each class')
    print(n_non_yes_suicidal_behaviour)

# Train both models with the rebalanced dataset: the first one is used as is,
# the second one is the copy whose counters are later replaced with the
# statistics of the original dataset.
for model in (rebalanced_model, original_data_model):
    model.fit(dataset_values, np.ravel(dataset_targets))

## PLOT trees
Generated with rebalanced dataset<br>
Generated with rebalanced dataset and relabeled with original dataset

In [None]:
# Render the tree trained on the rebalanced data as a PNG file
rebalanced_export_options = dict(out_file=None,
                                 feature_names=df_values.columns,
                                 class_names=['Non_suicidal','Suicidal'],
                                 filled=True)
# out_file=None makes export_graphviz return the DOT source as a string
dot_data = export_graphviz(rebalanced_model, **rebalanced_export_options)
graph = graphviz.Source(dot_data, format="png")
graph.render("decision_tree_rebalanced_data_do_NOT_USE_hyperparameter_best")

In [None]:
# Replace the node statistics of original_data_model (learnt from the rebalanced
# data) with the statistics of the original dataset. The structure of the tree
# (splits and thresholds) is left untouched.

# verbosity of this cell (0=none, 1=low, 2=high)
debug_level = 1

# Keep the structure, but reset the counters of every node
relabelled_tree = original_data_model.tree_
for node_id in range(relabelled_tree.node_count):
    relabelled_tree.value[node_id] = [[0., 0.]]
    relabelled_tree.impurity[node_id] = 0
    relabelled_tree.n_node_samples[node_id] = 0
    # NOTE(review): the weighted counter restarts at 1, not 0 - presumably so that
    # nodes reached by no instance never divide by zero when plotted; confirm.
    relabelled_tree.weighted_n_node_samples[node_id] = 1.

# Fill the nodes with the original (not rebalanced) data.
# Instances are pushed through the tree one at a time;
# a more advanced procedure can be proposed in the future.

n_instances = len(df_values)

# last percentage written to the progress bar (debug output only)
if debug_level >= 1:
    percentageInt_printed = 0

for row_id in range(n_instances):
    if debug_level >= 1:
        percentageInt = round(((row_id + 1) / (n_instances + 1)) * 100)
        # redraw the bar only when the rounded percentage changes
        if percentageInt != percentageInt_printed:
            percentageInt_printed = percentageInt
            bar_length = round(percentageInt_printed / 2)
            sys.stdout.write('\r' + ("[%-50s] %.2f%%" % ('=' * bar_length, percentageInt_printed)))
            sys.stdout.flush()

    # nodes visited by this instance on its way from the root to a leaf
    instance = df_values.iloc[row_id].to_numpy().reshape(1, -1)
    visited_nodes = original_data_model.decision_path(instance).indices
    instance_class = df_targets.iloc[row_id]

    # every visited node counts the instance under its real class
    for node_id in visited_nodes:
        relabelled_tree.value[node_id][0][instance_class] += 1
        relabelled_tree.n_node_samples[node_id] += 1
        relabelled_tree.weighted_n_node_samples[node_id] += 1


In [None]:
# Render the tree relabelled with the original data as a PNG file
original_export_options = dict(out_file=None,
                               feature_names=df_values.columns,
                               class_names=['Non_suicidal','Suicidal'],
                               filled=True)
# out_file=None makes export_graphviz return the DOT source as a string
dot_data = export_graphviz(original_data_model, **original_export_options)
graph = graphviz.Source(dot_data, format="png")
graph.render("decision_tree_original_data_hyperparameter_best")

## EXTRACT RULES TO EXCEL FILE

In [None]:
# function to extract the rules in text
def get_rules(tree, feature_names, class_names):
    """Return the rules of a fitted decision tree as human-readable strings.

    One rule is produced per leaf: the conjunction of the split conditions
    found on the way from the root to that leaf, followed by the outcome of
    the leaf. Rules are ordered from the leaf with most samples to the leaf
    with fewest.

    Parameters
    ----------
    tree : fitted estimator exposing ``tree_``
    feature_names : indexable of str
        Feature names, looked up with the feature ids stored in the tree.
    class_names : sequence of str or None
        Class labels. When None, the first stored value of the leaf is
        reported as a "response" instead of a class and probability.

    Returns
    -------
    list of str
    """
    tree_ = tree.tree_

    # name of the feature tested at every node (leaves test no feature)
    feature_name = []
    for feature_id in tree_.feature:
        if feature_id != _tree.TREE_UNDEFINED:
            feature_name.append(feature_names[feature_id])
        else:
            feature_name.append("undefined!")

    paths = []

    def recurse(node, path, paths):
        # leaf: close the path with the (value, samples) pair of the node
        if tree_.feature[node] == _tree.TREE_UNDEFINED:
            path += [(tree_.value[node], tree_.n_node_samples[node])]
            paths += [path]
            return

        # internal node: branch left (<=) and then right (>)
        name = feature_name[node]
        threshold = tree_.threshold[node]
        left_path = list(path) + [f"({name} <= {np.round(threshold, 3)})"]
        recurse(tree_.children_left[node], left_path, paths)
        right_path = list(path) + [f"({name} > {np.round(threshold, 3)})"]
        recurse(tree_.children_right[node], right_path, paths)

    recurse(0, [], paths)

    # sort by samples count, largest first
    samples_count = [p[-1][1] for p in paths]
    paths = [paths[idx] for idx in np.argsort(samples_count)[::-1]]

    rules = []
    for path in paths:
        conditions = path[:-1]
        leaf_value, leaf_samples = path[-1]

        rule = "if " + "\n and ".join(str(condition) for condition in conditions)
        rule += "\n then "
        if class_names is None:
            rule += "response: " + str(np.round(leaf_value[0][0], 3))
        else:
            classes = leaf_value[0]
            best_class = np.argmax(classes)
            if np.sum(classes) == 0:
                # leaf without any sample: no probability can be computed
                rule += f"class: {class_names[best_class]} (proba: 0%)"
            else:
                rule += f"class: {class_names[best_class]} (proba: {np.round(100.0*classes[best_class]/np.sum(classes),2)}%)"
        rule += f" | based on {leaf_samples:,} samples"
        rule += "\n--------------------------------------------------------------"
        rule += "\n"
        rules.append(rule)

    return rules

In [None]:
# Rules of the tree trained with the rebalanced dataset, in text form
rules = get_rules(rebalanced_model, df_values.columns, ['Non_suicidal','Suicidal'])
print('Rules: ', len(rules))

# verbosity of this cell (0=none, 1=low, 2=high);
# set it to 1 (text mode) to print every rule
debug_level = 0

if debug_level >= 1:
    print('')
    for rule_text in rules:
        print(rule_text)

In [None]:
# Rules of the tree trained with the rebalanced dataset
# BUT relabelled with the original data, in text form
rules = get_rules(original_data_model, df_values.columns, ['Non_suicidal','Suicidal'])
print('Rules: ', len(rules))

# verbosity of this cell (0=none, 1=low, 2=high);
# set it to 1 (text mode) to print every rule
debug_level = 0

if debug_level >= 1:
    print('')
    for rule_text in rules:
        print(rule_text)

In [None]:
# We use the above function to extract the rules.
# but reconverted to reformat them into an excel file
def get_rules_EXCEL(tree, feature_names, class_names):
    """Flatten a fitted decision tree into three spreadsheet-ready DataFrames.

    One row is produced for every leaf and for every internal node other than
    the root. A leaf row describes a complete rule; an internal-node row
    describes the partial rule that leads to that node.

    Parameters
    ----------
    tree : fitted estimator exposing ``tree_``
    feature_names : indexable of str
        Feature names, looked up with the feature ids stored in the tree.
        They also label the columns of the third returned DataFrame, so there
        must be exactly one name per entry of ``attribute_keys`` (18).
    class_names : sequence of str or None
        Class labels used in the textual rule. When None, the first stored
        value of the node is reported as a "response".

    Returns
    -------
    df1 : pandas.DataFrame
        One row per rule: number of samples, coverage (samples divided by the
        samples of the root), fraction of class 1, depth, number of distinct
        attributes used, leaf flag (the strings 'True'/'False'), and the
        lower (``_Inf``, from ``>``) / upper (``_Sup``, from ``<=``) bound
        that the rule imposes on every attribute.
    df2 : pandas.DataFrame
        The same rules as text, in the same order as ``df1``.
    df3 : pandas.DataFrame
        A single row counting in how many rules each attribute appears.
    """
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    # Attributes in spreadsheet order. A condition is assigned to the FIRST key
    # found inside its text, so the order of this tuple matters.
    # Attribute 0 (sex) fills the single cell 0 with 'F'/'M'; attribute k >= 1
    # fills cell 2k-1 with its lower bound and cell 2k with its upper bound.
    attribute_keys = (
        'Sex_M0_F1',
        'Age',
        'Day_in_week',
        'Day_in_month',
        'Month',
        'Quarter',
        'Week_in_year',
        'Week_in_month',
        'Working_day',
        'Day1_Night2',
        'Population_with_income_per_unit_of_consumption_below_50perc_of_median',
        'Population_with_income_per_consumption_unit_below_5000_Euros',
        'Percentage_of_single_person_households',
        'Percentage_of_population_aged_65_and_over',
        'Percentage_of_population_under_18',
        'Average_net_income_per_person',
        'Average_household_size',
        'Num_requests_last_months',
    )
    n_attributes = len(attribute_keys)
    n_row_cells = 2 * n_attributes - 1

    tsamples = tree_.n_node_samples[0]  # samples at the root, used for coverage

    # How many rules use each attribute
    useAttributes = [0] * n_attributes

    rawData = []   # For data
    rawRules = []  # For rules
    amIaLeaf = []  # Leaf flag of every path, parallel to `paths`
    paths = []

    def recurse(node, path, paths):
        node_summary = (tree_.value[node], tree_.n_node_samples[node])

        if tree_.feature[node] == _tree.TREE_UNDEFINED:  # leaf
            path += [node_summary]
            paths += [path]
            amIaLeaf.append('True')
            return

        if path:  # internal node other than the root: record the partial rule
            paths += [path + [node_summary]]
            amIaLeaf.append('False')

        name = feature_name[node]
        threshold = tree_.threshold[node]
        left_path = list(path) + [f"({name} <= {np.round(threshold, 3)})"]
        recurse(tree_.children_left[node], left_path, paths)    # left branch
        right_path = list(path) + [f"({name} > {np.round(threshold, 3)})"]
        recurse(tree_.children_right[node], right_path, paths)  # right branch

    recurse(0, [], paths)

    for count, path in enumerate(paths):
        conditions = path[:-1]
        node_value, node_samples = path[-1]
        class_counts = node_value[0]

        # Bounds imposed by the rule, one cell per spreadsheet column
        rowValues = [None] * n_row_cells
        # Set to 1 if the attribute appears in the rule
        useLocalAttributes = [0] * n_attributes

        for p in conditions:
            text = str(p)
            # the threshold is the last token of "(name <= value)"
            value = float(text.split()[-1].replace(")", ""))
            is_lower_bound = '>' in text

            for k, key in enumerate(attribute_keys):
                if key not in text:
                    continue
                useLocalAttributes[k] = 1
                if k == 0:
                    rowValues[0] = 'F' if is_lower_bound else 'M'
                elif is_lower_bound:
                    rowValues[2 * k - 1] = value
                else:
                    rowValues[2 * k] = value
                break

        # We update the use of global attributes
        useAttributes = np.add(useAttributes, useLocalAttributes)

        if np.sum(class_counts) == 0:
            suicide_fraction = 0
        else:
            suicide_fraction = class_counts[1] / np.sum(class_counts)

        # Metadata followed by the bounds of every attribute
        rawData.append([
            node_samples,               # Number of samples
            node_samples / tsamples,    # Fraction of all samples covered
            suicide_fraction,           # Fraction of suicidal behaviour
            len(conditions),            # Depth of the rule
            sum(useLocalAttributes),    # Distinct attributes used in the rule
            amIaLeaf[count],            # Whether it is a leaf or not
        ] + rowValues)

        rule = "if " + "\n and ".join(str(p) for p in conditions)
        rule += "\n then "
        if class_names is None:
            rule += "response: " + str(np.round(class_counts[0], 3))
        else:
            best_class = np.argmax(class_counts)
            if np.sum(class_counts) == 0:
                rule += f"class: {class_names[best_class]} (proba: 0%)"
            else:
                rule += f"class: {class_names[best_class]} (proba: {np.round(100.0*class_counts[best_class]/np.sum(class_counts),2)}%)"
        rule += f" | based on {node_samples:,} samples"
        rawRules.append(rule)

    # The rules.
    # NOTE(review): '..._of_mediana_Sup' looks like a typo of '..._of_median_Sup';
    # it is kept unchanged because existing spreadsheets may rely on the name.
    df1 = pd.DataFrame(rawData, columns=['Samples','Coverage','SuicidePercentage','Depth','AttributesUsed','Leaf',
                              'Sex_M0_F1','Age_Inf','Age_Sup','Day_in_week_Inf','Day_in_week_Sup',
                              'Day_in_month_Inf','Day_in_month_Sup','Month_Inf','Month_Sup','Quarter_Inf','Quarter_Sup',
                              'Week_in_year_Inf','Week_in_year_Sup','Week_in_month_Inf',
                              'Week_in_month_Sup','Working_day_Inf','Working_day_Sup','Day1_Night2_Inf','Day1_Night2_Sup',
                              'Population_with_income_per_unit_of_consumption_below_50perc_of_median_Inf',
                              'Population_with_income_per_unit_of_consumption_below_50perc_of_mediana_Sup',
                              'Population_with_income_per_consumption_unit_below_5000_Euros_Inf',
                              'Population_with_income_per_consumption_unit_below_5000_Euros_Sup',
                              'Percentage_of_single_person_households_Inf','Percentage_of_single_person_households_Sup',
                              'Percentage_of_population_aged_65_and_over_Inf','Percentage_of_population_aged_65_and_over_Sup',
                              'Percentage_of_population_under_18_Inf','Percentage_of_population_under_18_Sup',
                              'Average_net_income_per_person_Inf','Average_net_income_per_person_Sup',
                              'Average_household_size_Inf','Average_household_size_Sup',
                              'Num_requests_last_months_Inf','Num_requests_last_months_Sup'])
    df2 = pd.DataFrame(rawRules, columns=['Rule'])  # Rule in original text format
    # Use of global attributes, labelled with the names received as argument
    # (not with a module-level DataFrame)
    df3 = pd.DataFrame([useAttributes], columns=list(feature_names))

    return df1,df2,df3

In [None]:
# Rules of the tree trained with the rebalanced dataset, as spreadsheet tables
df1,df2,df3 = get_rules_EXCEL(rebalanced_model, df_values.columns, ['Non_suicidal','Suicidal'])

# Export to excel: one sheet per table
with pd.ExcelWriter('RULES_decision_tree_rebalanced_data_do_NOT_USE_hyperparameter_best.xlsx') as writer:
    for frame, sheet in ((df1, 'Rule_Details'), (df2, 'Raw_Rules'), (df3, 'Attribute_Count')):
        frame.to_excel(writer, sheet_name=sheet)

# verbosity of this cell (0=none, 1=low, 2=high)
debug_level = 0

# prints whatever `rules` currently holds from the earlier text-mode cells
if debug_level >= 1:
    print('')
    for rule_text in rules:
        print(rule_text)

# Total number of nodes (all) and rules (leaf nodes)
rebalanced_tree = rebalanced_model.tree_
print('Nodes: ', rebalanced_tree.node_count)
print('Rules: ', Counter(rebalanced_tree.feature)[_tree.TREE_UNDEFINED])

In [None]:
# Rules of the tree relabelled with the original data, as spreadsheet tables
df1,df2,df3 = get_rules_EXCEL(original_data_model, df_values.columns, ['Non_suicidal','Suicidal'])

# Export to excel: one sheet per table
with pd.ExcelWriter('RULES_decision_tree_original_data_hyperparameter_best.xlsx') as writer:
    for frame, sheet in ((df1, 'Rule_Details'), (df2, 'Raw_Rules'), (df3, 'Attribute_Count')):
        frame.to_excel(writer, sheet_name=sheet)

# verbosity of this cell (0=none, 1=low, 2=high)
debug_level = 0

# prints whatever `rules` currently holds from the earlier text-mode cells
if debug_level >= 1:
    print('')
    for rule_text in rules:
        print(rule_text)

# Total number of nodes (all) and rules (leaf nodes)
relabelled_tree_stats = original_data_model.tree_
print('Nodes: ', relabelled_tree_stats.node_count)
print('Rules: ', Counter(relabelled_tree_stats.feature)[_tree.TREE_UNDEFINED])