# <span style='font-family:Arial'><span style='color:blue'>Section 1: Import</span>  

In [1]:
# ---------- Import ----------

import pandas as pd
import numpy as np
import copy
import re

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth, association_rules

#from sklearn.model_selection import KFold
#from sklearn.model_selection import cross_val_score

#from sklearn.metrics import confusion_matrix
#from sklearn.metrics import accuracy_score
#from sklearn.metrics import recall_score
#from sklearn.metrics import precision_score

#from sklearn.metrics import roc_curve
#from sklearn.metrics import auc
#from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns
#from sklearn.tree import plot_tree

# <span style='font-family:Arial'><span style='color:blue'>Section 2: Read, pre-process</span>  

In [2]:
# ---------- Read and pre-process the training data in one pass ----------

df = (
    pd.read_csv('../data/data_train.csv')
    .drop(columns=['index'])
    # Features with significant missing values found from Code 1
    .drop(columns=['pco2','ph','lactic_acid','basophils','bmi','creatine_kinase','neutrophils','lymphocyte','urine_output'])
    # Age
    .drop(columns=['age'])
    # Incomplete cases: remove every row that still has a missing value,
    # then renumber the remaining rows from 0
    .dropna()
    .reset_index(drop=True)
)

# <span style='font-family:Arial'><span style='color:blue'>Section 3: Encode categorical features and target</span>  

In [3]:
# ---------- Categorical features and target, listed once and reused below ----------

categorical_columns = ['group',
                       'gendera',
                       'hypertensive',
                       'atrialfibrillation',
                       'chd_with_no_mi',
                       'diabetes',
                       'deficiencyanemias',
                       'depression',
                       'hyperlipemia',
                       'renal_failure',
                       'copd',
                       'outcome']

print('---------- Before encoding ----------')
print('')
print(df[categorical_columns].head(5))
print('')

# ---------- Encoding ----------

# Replace every value by the label '<column>_<integer value>', e.g. 1.0 in
# 'outcome' becomes 'outcome_1'.
# NOTE: this cell cannot be run twice on the same frame. After the first run the
# values are strings such as 'outcome_1', which int() cannot parse (ValueError).
for categorical_column in categorical_columns:
    # `prefix` is bound as a default argument so the label never depends on the
    # loop variable's later value.
    df[categorical_column] = df[categorical_column].map(
        lambda value, prefix=categorical_column: prefix + '_' + str(int(value))
    )

print('---------- After encoding ----------')
print('')
print(df[categorical_columns].head(5))

---------- Before encoding ----------

   group  gendera  hypertensive  atrialfibrillation  chd_with_no_mi  diabetes  \
0      1        2             1                   1               0         1   
1      1        2             1                   0               0         0   
2      1        1             0                   0               0         1   
3      2        1             1                   1               0         0   
4      1        1             1                   1               0         0   

   deficiencyanemias  depression  hyperlipemia  renal_failure  copd  outcome  
0                  0           1             0              0     0      1.0  
1                  0           0             0              0     0      1.0  
2                  0           1             0              0     0      0.0  
3                  0           0             0              1     0      0.0  
4                  1           0             0              1     0      0.0  


# <span style='font-family:Arial'><span style='color:blue'>Section 4: Generate association rules</span>  

In [39]:
# ---------- Convert to list ----------

# Each row becomes one transaction: the list of that row's encoded labels.
item_columns = ['group',
                'gendera',
                'hypertensive',
                'atrialfibrillation',
                'chd_with_no_mi',
                'diabetes',
                'deficiencyanemias',
                'depression',
                'hyperlipemia',
                'renal_failure',
                'copd',
                'outcome']
df_list = df[item_columns].to_numpy().tolist()

# ---------- Encode dataframe into market basket structure ----------

# One column per distinct label (te.columns_), one row per transaction.
te = TransactionEncoder()
te.fit(df_list)
te_frame = te.transform(df_list)
df_temp = pd.DataFrame(te_frame, columns=te.columns_)

# ---------- Mining of frequent itemset ----------

# Itemsets with support of at least 0.1, reported with item names.
frequent_itemsets = fpgrowth(df_temp, use_colnames=True, min_support=0.1)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.920897,(copd_0)
1,0.914994,(chd_with_no_mi_0)
2,0.708383,(group_1)
3,0.704841,(hypertensive_1)
4,0.667060,(deficiencyanemias_0)
...,...,...
6621,0.100354,"(deficiencyanemias_0, atrialfibrillation_0, ch..."
6622,0.110980,"(renal_failure_0, hyperlipemia_1, atrialfibril..."
6623,0.101535,"(renal_failure_0, chd_with_no_mi_0, hyperlipem..."
6624,0.101535,"(renal_failure_0, copd_0, hyperlipemia_1, atri..."


# <span style='font-family:Arial'><span style='color:blue'>Section 5: Outcome_0</span>  

In [55]:
# ---------- Separate frequent itemset into antecedent and consequent ----------

# Only rules with lift >= 1.13 are generated.
df_temp_temp = association_rules(frequent_itemsets, metric="lift", min_threshold=1.13)

# ---------- Look for outcome in consequent ----------

# Keep the rules whose consequent is exactly {outcome_0}.
# A boolean mask replaces the hand-maintained row counter. Selection through a
# mask keeps the rule table's columns and dtypes even when no rule matches, so
# the sort on 'lift' below cannot fail on a missing column.
is_outcome_only = np.array(
    [('outcome_0' in consequent) and (len(consequent) == 1)
     for consequent in df_temp_temp['consequents']],
    dtype=bool,
)

# Strongest association first.
temp = df_temp_temp.loc[is_outcome_only].sort_values(by='lift', ascending=False)
temp

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
35750,"(group_1, hyperlipemia_1, hypertensive_1, atri...",(outcome_0),0.115702,0.861865,0.113341,0.979592,1.136595,0.013621,6.768595
30270,"(hypertensive_1, depression_0, gendera_2, defi...",(outcome_0),0.109799,0.861865,0.107438,0.978495,1.135322,0.012806,6.423259
35866,"(deficiencyanemias_0, copd_0, hyperlipemia_1, ...",(outcome_0),0.108619,0.861865,0.106257,0.978261,1.135051,0.012643,6.354191
35792,"(atrialfibrillation_0, group_1, chd_with_no_mi...",(outcome_0),0.106257,0.861865,0.103896,0.977778,1.13449,0.012317,6.216057
35848,"(deficiencyanemias_0, hyperlipemia_1, hyperten...",(outcome_0),0.106257,0.861865,0.103896,0.977778,1.13449,0.012317,6.216057
30279,"(depression_0, gendera_2, copd_0, hypertensive...",(outcome_0),0.105077,0.861865,0.102715,0.977528,1.1342,0.012153,6.146989
35716,"(copd_0, hyperlipemia_1, hypertensive_1, atria...",(outcome_0),0.145218,0.861865,0.141677,0.97561,1.131975,0.016518,5.663518
30790,"(hypertensive_1, deficiencyanemias_1, atrialfi...",(outcome_0),0.142857,0.861865,0.139315,0.975207,1.131507,0.016192,5.571429
29824,"(deficiencyanemias_1, renal_failure_1)",(outcome_0),0.139315,0.861865,0.135773,0.974576,1.130775,0.015702,5.433294


# <span style='font-family:Arial'><span style='color:blue'>Section 6: Outcome_1</span>  

In [52]:
# ---------- Separate frequent itemset into antecedent and consequent ----------

# Only rules with lift >= 1.2 are generated.
df_temp_temp = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

# ---------- Look for outcome in consequent ----------

# Keep the rules whose consequent is exactly {outcome_1}.
# A boolean mask replaces the hand-maintained row counter. Selection through a
# mask keeps the rule table's columns and dtypes even when no rule matches, so
# the sort on 'lift' below cannot fail on a missing column.
is_outcome_only = np.array(
    [('outcome_1' in consequent) and (len(consequent) == 1)
     for consequent in df_temp_temp['consequents']],
    dtype=bool,
)

# Strongest association first.
temp = df_temp_temp.loc[is_outcome_only].sort_values(by='lift', ascending=False)
temp

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4610,"(deficiencyanemias_0, copd_0, depression_0)",(outcome_1),0.544274,0.138135,0.102715,0.18872,1.366205,0.027532,1.062353
4602,"(deficiencyanemias_0, chd_with_no_mi_0, depres...",(outcome_1),0.559622,0.138135,0.101535,0.181435,1.313462,0.024232,1.052897
4594,"(deficiencyanemias_0, copd_0, chd_with_no_mi_0)",(outcome_1),0.561983,0.138135,0.101535,0.180672,1.307944,0.023905,1.051918
4582,"(deficiencyanemias_0, copd_0)",(outcome_1),0.608028,0.138135,0.109799,0.180583,1.307294,0.02581,1.051803
4590,"(deficiencyanemias_0, depression_0)",(outcome_1),0.598583,0.138135,0.106257,0.177515,1.285086,0.023572,1.04788
4618,"(renal_failure_0, copd_0)",(outcome_1),0.585596,0.138135,0.103896,0.177419,1.284395,0.023005,1.047758
4626,"(renal_failure_0, depression_0)",(outcome_1),0.567887,0.138135,0.100354,0.176715,1.279297,0.021909,1.046862
4631,"(copd_0, hyperlipemia_0)",(outcome_1),0.576151,0.138135,0.100354,0.17418,1.260946,0.020768,1.043648
4622,"(renal_failure_0, chd_with_no_mi_0)",(outcome_1),0.593861,0.138135,0.102715,0.172962,1.252128,0.020683,1.042111
4586,"(deficiencyanemias_0, chd_with_no_mi_0)",(outcome_1),0.616293,0.138135,0.105077,0.170498,1.23429,0.019945,1.039016


# <span style='font-family:Arial'><span style='color:blue'>That's all for tonight, 26 Oct 2021</span>  

In [None]:
# Scratch check of frozenset basics: size and membership tests.
fs = frozenset(range(1, 6))

# Compute all three facts first, then report them in the same order.
size = len(fs)
contains_item = 5 in fs
not_contains_item = 6 not in fs

print('frozenset size =', size)
print('fs contains 5 =', contains_item)
print('fs not contains 6 =', not_contains_item)

In [None]:
# Share of all rows whose 'sp_o2' equals "spo2_above_90" (support of that item).
# NOTE(review): no visible cell writes this string label into df - confirm the source.
spo2_above_90 = df['sp_o2'] == "spo2_above_90"
round(int(spo2_above_90.sum()) / len(df), 6)

In [None]:
# Share of all rows where both labels occur together (support of the pair).
# NOTE(review): no visible cell writes these string labels into df - confirm the source.
spo2_and_probnp = (df['sp_o2'] == "spo2_above_90") & (df['nt-probnp'] == "nt_probnp_above_450")
round(int(spo2_and_probnp.sum()) / len(df), 6)

In [None]:
# Among rows with sp_o2 == "spo2_above_90", the share that also has
# nt-probnp == "nt_probnp_above_450" (confidence of spo2_above_90 -> nt_probnp_above_450).
# NOTE(review): no visible cell writes these string labels into df - confirm the source.
spo2_above_90 = df['sp_o2'] == "spo2_above_90"
probnp_above_450 = df['nt-probnp'] == "nt_probnp_above_450"
round(int((spo2_above_90 & probnp_above_450).sum()) / int(spo2_above_90.sum()), 6)

In [None]:
# Share of all rows whose 'nt-probnp' equals "nt_probnp_above_450" (support of that item).
# NOTE(review): no visible cell writes this string label into df - confirm the source.
probnp_above_450 = df['nt-probnp'] == "nt_probnp_above_450"
round(int(probnp_above_450.sum()) / len(df), 6)

In [None]:
# Keep only the itemsets whose lower-cased text form contains 'outcome_1'.
# This overwrites frequent_itemsets, so cells run afterwards see the filtered table.
itemset_text = frequent_itemsets['itemsets'].astype(str).str.lower()
mentions_outcome_1 = itemset_text.str.contains('outcome_1')
frequent_itemsets = frequent_itemsets[mentions_outcome_1]
frequent_itemsets

In [None]:
# ---------- ccp versus impurities ----------

# NOTE(review): DecisionTreeClassifier, X_train and y_train are not imported or
# defined in the cells visible here - confirm where they come from.

# Constructor options shared by every tree built in this cell.
tree_options = {'class_weight': {0: 0.136, 1: 0.864}, 'random_state': 42}

dt = DecisionTreeClassifier(**tree_options)
ccp_impurities = dt.cost_complexity_pruning_path(X_train, y_train)

# ---------- depth, leaves, nodes ----------

# One entry per alpha, in the order of ccp_impurities['ccp_alphas'].
dt_depth, dt_leaves, dt_nodes = [], [], []

for alpha in ccp_impurities['ccp_alphas']:
    # Refit a tree pruned with this alpha and record its size.
    dt = DecisionTreeClassifier(ccp_alpha=alpha, **tree_options)
    dt_result = dt.fit(X_train, y_train)
    dt_depth.append(dt_result.get_depth())
    dt_leaves.append(dt_result.get_n_leaves())
    dt_nodes.append(dt_result.tree_.node_count)

In [None]:
# Four stacked panels, each plotted against the pruning alphas:
# impurity, depth, number of leaves, number of nodes.
panels = [
    ('Impurity', ccp_impurities['impurities']),
    ('Depth', dt_depth),
    ('Leaves', dt_leaves),
    ('Nodes', dt_nodes),
]

# Create the 4x1 grid in one call. Opening a bare figure with plt.subplots()
# and then adding panels with plt.subplot() leaves an extra full-figure Axes
# underneath the panels.
fig, axes = plt.subplots(len(panels), 1, figsize=(15, 20))

for ax, (ylabel, values) in zip(axes, panels):
    ax.plot(ccp_impurities['ccp_alphas'], values, marker='o', drawstyle="steps-post")

    ax.set_xlabel('alpha', fontsize=20)
    ax.set_xlim(0, 0.06)  # same alpha window on every panel

    ax.set_ylabel(ylabel, fontsize=20)
    ax.tick_params(axis='both', labelsize=14)


<font size='4'><font color=green>**Choose ccp_alpha=0.015 for balance between complexity (nodes, leaves, depth) and impurities (which can affect accuracy)**  

<font size='4'><font color=green>**A tree with few nodes and leaves is preferred, so that doctors and nurses can easily follow its decision path and intervene to keep alive the patients who are predicted to die**  
<br>  


<br>  

<font size='5'><font color=red>**------------------------- Train -------------------------**  

<br>  

<font size='5'><font color=blue>**Cross validate, validate accuracy**  

In [None]:
# ---------- Cross validate, validate accuracy ----------

# NOTE(review): KFold, cross_val_score, DecisionTreeClassifier, X_train and y_train
# are not imported or defined in the cells visible here - confirm where they come from.

# 5 shuffled folds with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

dt = DecisionTreeClassifier(class_weight={0:0.136, 1:0.864},
                            random_state=42,
                            ccp_alpha=0.015)

# Run the cross-validation once and reuse the per-fold scores for both lines of
# output, instead of fitting all five folds a second time just for the mean.
cv_accuracy = cross_val_score(dt, X_train, y_train, scoring='accuracy', cv=cv)

print('Individual validate accuracy :', cv_accuracy)
print('Mean validate accuracy :', cv_accuracy.mean())

<br>  

<font size='5'><font color=blue>**Full train dataset fit, result summary**  

In [None]:
# Fit the pruned tree on the full training set.
dt = DecisionTreeClassifier(ccp_alpha=0.015,
                            random_state=42,
                            class_weight={0:0.136, 1:0.864})
dt.fit(X_train, y_train)

# Pair each training column with its importance, most important first.
temp_df = pd.DataFrame({'feature': X_train.columns,
                        'importance': dt.feature_importances_})
temp_df = temp_df.sort_values(by='importance', ascending=False)
print(temp_df)

<font size='4'><font color=green>**Features that can affect mortality of patients are**  
* <font size='3'><font color=green>**bicarbonate**  
* <font size='3'><font color=green>**rdw**  
* <font size='3'><font color=green>**anion_gap**  
* <font size='3'><font color=green>**leucocyte**  
* <font size='3'><font color=green>**blood_calcium**  
<br>  
<br>  


<br>  

<font size='5'><font color=blue>**Visualise decision tree structure**  

In [None]:
# Draw the fitted tree `dt`; nodes are coloured (filled=True) and the two
# classes are labelled 'alive' and 'dead'.
# NOTE(review): plot_tree is only present as a commented-out import in the
# visible import cell - confirm it is imported before this cell runs.
plt.figure(figsize=(20, 20))

plot_tree(dt,
          # A plain list of names instead of the pandas Index: a list is accepted
          # by every scikit-learn release, whereas some releases validate this
          # argument strictly and reject an Index.
          feature_names=list(X_train.columns),
          class_names=['alive', 'dead'],
          filled=True,
          fontsize=10);

<br>  

<font size='5'><font color=red>**------------------------- Test -------------------------**  

<br>  

<font size='5'><font color=blue>**Read, pre-process, predict**  

In [None]:
# ---------- Read and pre-process the test data in one pass ----------

# NOTE(review): unlike the training read in Section 2, 'age' is not dropped here -
# confirm this matches the columns X_train was built from.
df = (
    pd.read_csv('../data/data_test.csv')
    .drop(columns=['index'])
    # Features found from Step 1
    .drop(columns=['pco2','ph','lactic_acid','basophils','bmi','creatine_kinase','neutrophils','lymphocyte','urine_output'])
    # Incomplete cases: remove rows with any missing value, renumber from 0
    .dropna()
    .reset_index(drop=True)
)

# ---------- Form X and y ----------

# Every column except the target goes into X; the target alone is y.
X_test = df.drop(columns=['outcome'])
y_test = df['outcome']

# ---------- Predict with the already-fitted tree `dt` ----------

y_test_predicted = dt.predict(X_test)

# ---------- Save predicted ----------

# The prediction is stored as a new column next to the inputs and written out.
df['outcome_predicted'] = y_test_predicted
df.to_csv('../data/data_test_predicted_dt.csv', na_rep='NaN', index_label='index')

<br>  

<font size='5'><font color=blue>**Evaluate**  

In [None]:
# ---------- Evaluate: Confusion matrix calculation ----------

# ravel() flattens the 2x2 matrix; the four values are unpacked as tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_test_predicted).ravel()

for label, count in (('True positive (dead): ', tp),
                     ('True negative (alive): ', tn),
                     ('False positive (dead): ', fp),
                     ('False negative (alive): ', fn)):
    print(label, count)
print('')

# Each metric line is followed by one blank line (end='\n\n'), except the last.
print('Accuracy :', accuracy_score(y_test, y_test_predicted), end='\n\n')

print('Sensitivity or Recall :', recall_score(y_test, y_test_predicted), end='\n\n')

print('Precision or Positive Predictive Value :', precision_score(y_test, y_test_predicted), end='\n\n')

# Specificity and negative predictive value are derived from the raw counts.
print('Specificity :', tn/(tn+fp), end='\n\n')

print('Negative predictive value :', tn/(tn+fn))

<font size='4'><font color=green>**Test accuracy at 0.712 is far from validate accuracy at 0.658 ( 8.21% ) ----> Model has poor generalisation, although validate accuracy is lower than test accuracy**  
<font size='4'><font color=green>**Sensitivity low at 0.538 and precision low at 0.256**  
<font size='4'><font color=green>**Specificity low at 0.747 and negative predictive value high at 0.909**  
<br>  


In [None]:
# ---------- Evaluate: Confusion matrix visualisation ----------

# Rows are the predicted class, columns the actual class (see axis titles below).
class_labels = ['dead (positive)', 'alive (negative)']
temp_df = pd.DataFrame([[tp, fp], [fn, tn]], index=class_labels, columns=class_labels)

fig, ax = plt.subplots(figsize=(9, 6))

sns.heatmap(temp_df, annot=True, fmt='d', annot_kws={'size': 20}, ax=ax)

# Colour map: the colour bar is the last axes added to the figure

cax = fig.axes[-1]
cax.tick_params(labelsize=20)

# Tick labels: x ticks moved to the top edge, y ticks kept on the left

ax.tick_params(axis='x', which='major', labelsize=20, labelrotation=0,
               top=True, labeltop=True, bottom=False, labelbottom=False)
ax.tick_params(axis='y', which='major', labelsize=20, labelrotation=0,
               left=True, labelleft=True)

# X axis title

ax.xaxis.set_label_position('top')
ax.set_xlabel('actual', fontsize=20)

# Y axis title

ax.set_ylabel('predicted', fontsize=20, rotation=0)

<font size='4'><font color=green>**From sensitivity (0.538), 46.2% of patients who actually died were not predicted to die, so they would not be picked up for doctors and nurses to act on, through the features that can affect mortality, to save them**  

<font size='4'><font color=green>**The same percentage was higher at 66.7% for logistic regression. This means that the decision tree has a better outcome than logistic regression for such patients.**  
<br>  


In [None]:
# ---------- Evaluate: Area under ROC curve ----------

# Column 1 of predict_proba is used as the score for the ROC curve.
y_test_predicted_prob = dt.predict_proba(X_test)[:,1]

fig, ax = plt.subplots(figsize=(8,5))

# Every threshold is kept (drop_intermediate=False).
fpr, tpr, _ = roc_curve(y_test, y_test_predicted_prob, drop_intermediate=False)
roc_auc = auc(fpr, tpr)

ax.plot(fpr, tpr, label='AUC = ' + str(round(roc_auc, 3)))
ax.plot([0, 1], [0, 1], label='baseline', linestyle='--')  # diagonal reference line
ax.legend(loc='lower right')
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
print('')

# The printed AUC is recomputed from the scores with roc_auc_score.
print('Area Under Curve (AUC) : ', roc_auc_score(y_test, y_test_predicted_prob))
print('')

<font size='4'><font color=green>**Area under Receiver Operating Characteristic curve poor at 0.683**  
<br>  
