# Detect liver disease patients with a decision tree classifier in Python
08-detect-liver-disease-patient-w-dt-in-python

In [1]:
# Setup libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [2]:
# Load the liver dataset from its CSV file into a dataframe
csv_path = '../00-Datasets/liver.csv'
df = pd.read_csv(csv_path)

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    object 
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [4]:
# Preview the first rows of the loaded data
df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [5]:
# Preview the last rows of the loaded data
df.tail()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
578,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,2
579,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.1,1
580,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.0,1
581,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.0,1
582,38,Male,1.0,0.3,216,21,24,7.3,4.4,1.5,2


In [6]:
# Show 6 randomly chosen rows; no random_state is passed, so the rows
# displayed change from run to run
df.sample(6)

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
134,18,Male,1.8,0.7,178,35,36,6.8,3.6,1.1,1
576,32,Male,15.0,8.2,289,58,80,5.3,2.2,0.7,1
552,45,Male,0.6,0.1,196,29,30,5.8,2.9,1.0,1
208,70,Female,0.9,0.3,220,53,95,6.1,2.8,0.68,1
171,44,Female,1.9,0.6,298,378,602,6.6,3.3,1.0,1
261,33,Male,1.5,7.0,505,205,140,7.5,3.9,1.0,1


In [7]:
# Check for missing values: count the NaN entries in each column
df.isna().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

In [8]:
# Check for duplicate values: count rows that exactly repeat an earlier row
# NOTE(review): the count is only reported here; the preparation cell further
# down drops missing values but not duplicates -- confirm that is intended.
df.duplicated().sum()

13

In [13]:
# Explore some features
# Count the rows per value of 'Dataset', the column later used as the target
df['Dataset'].value_counts()

1    416
2    167
Name: Dataset, dtype: int64

In [15]:
# Count the rows per value of the 'Gender' column
df['Gender'].value_counts()

Male      441
Female    142
Name: Gender, dtype: int64

In [16]:
# Explore tabular summary: count, mean, std, min, quartiles and max of the
# numeric columns
df.describe()

Unnamed: 0,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
count,583.0,583.0,583.0,583.0,583.0,583.0,583.0,583.0,579.0,583.0
mean,44.746141,3.298799,1.486106,290.576329,80.713551,109.910806,6.48319,3.141852,0.947064,1.286449
std,16.189833,6.209522,2.808498,242.937989,182.620356,288.918529,1.085451,0.795519,0.319592,0.45249
min,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,33.0,0.8,0.2,175.5,23.0,25.0,5.8,2.6,0.7,1.0
50%,45.0,1.0,0.3,208.0,35.0,42.0,6.6,3.1,0.93,1.0
75%,58.0,2.6,1.3,298.0,60.5,87.0,7.2,3.8,1.1,2.0
max,90.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,2.0


In [17]:
# Prepare dataset
# Remove every row that still contains a missing value.
df.dropna(inplace=True)

# Learn the Gender categories first, then swap the strings for their
# integer codes.
le = LabelEncoder()
le.fit(df['Gender'])
df['Gender'] = le.transform(df['Gender'])

In [20]:
# Split dataset into 80% train and 20% test
# 'Dataset' is the target; every other column is an input feature.
y = df['Dataset']
X = df.drop(columns='Dataset')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [22]:
# Build model
# Fix the estimator's seed (same value as the train/test split). scikit-learn
# permutes the candidate features at every split, so equally good splits can
# be chosen differently between runs; without random_state the fitted tree,
# and therefore the predictions and metrics below, are not reproducible.
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train)

In [24]:
# Feed test data into model
y_predict = classifier.predict(X_test)

# Put the true labels next to the predicted ones and inspect 12 random rows.
comparison_columns = {'Actual': y_test, 'Predict': y_predict}
actual_vs_predict = pd.DataFrame(comparison_columns)
actual_vs_predict.sample(n=12)

Unnamed: 0,Actual,Predict
165,1,1
17,2,1
348,1,1
14,1,1
319,2,1
561,1,1
316,2,2
324,1,1
501,1,1
46,1,2


In [25]:
# Evaluate the model
# Build the per-class precision / recall / f1 text report, then print it.
report = classification_report(y_test, y_predict)
print(report)

              precision    recall  f1-score   support

           1       0.75      0.80      0.78        82
           2       0.43      0.35      0.39        34

    accuracy                           0.67       116
   macro avg       0.59      0.58      0.58       116
weighted avg       0.66      0.67      0.66       116



## Notes
* We wanted to detect patients with liver disease.
* Our dataset had 583 records and 11 columns (10 input features plus the target column `Dataset`): 5 of type float, 5 of type integer, and 1 of type object.
* A quick look showed 4 missing values, all in `Albumin_and_Globulin_Ratio`. We dropped the records with missing values, leaving 579. The same check found 13 duplicated rows, which were not removed.
* The dataset was split into 80% train and 20% test sets. No validation set was created or used. No hyperparameter tuning occurred.
* We built, trained, and tested a decision tree classifier.
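* We evaluated the model's predictions against the test-set ground truth using precision, recall, and f1-score. In the run shown above, overall accuracy was 0.67, and recall was much lower for class 2 (0.35) than for class 1 (0.80).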