In [1]:
from ml import *

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Location of the Indian Liver Patient CSV; kept in one named place so it is easy to repoint.
DATA_PATH = '/data/datasets/indian_liver_patient.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
# Peek at the first five rows to see the column names and value ranges.
df.head(n=5)

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


We can convert the categorical feature Gender to 0 and 1, and the target variable Dataset also to 0 and 1.

In [5]:
# List the distinct raw label values before recoding them.
df['Dataset'].unique()

array([1, 2])

In [6]:
# Recode the two non-binary columns in place:
#   Gender  -> 1 for 'Male', 0 for every other value
#   Dataset -> raw label minus one, so {1, 2} becomes {0, 1}
# This cell is not re-run safe: executed a second time, Gender no longer holds
# 'Male' strings (every row becomes 0) and Dataset is shifted down once more.
# NOTE(review): the UCI ILPD documentation reportedly encodes 1 = liver patient and
# 2 = non-patient. If that holds, class 1 after this shift is the NON-patient group,
# which would invert the reading of the recall/precision figures below -- confirm.
is_male = df['Gender'] == 'Male'
df['Gender'] = is_male.astype(int)
df['Dataset'] = df['Dataset'] - 1

We check the datatypes, which are now all numeric, and count the missing values per column.

In [7]:
# Show the dtype of every column to confirm that Gender and Dataset are now numeric.
df.dtypes

Age                             int64
Gender                          int64
Total_Bilirubin               float64
Direct_Bilirubin              float64
Alkaline_Phosphotase            int64
Alamine_Aminotransferase        int64
Aspartate_Aminotransferase      int64
Total_Protiens                float64
Albumin                       float64
Albumin_and_Globulin_Ratio    float64
Dataset                         int64
dtype: object

In [8]:
# Count the missing entries in each column.
df.isna().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

In this case we opt to drop the rows containing missing values.

In [9]:
# Keep only the rows without any missing value (drop a row as soon as one column is NaN).
# Note that this rebinds `df`, so the unfiltered frame is no longer available afterwards.
df = df.dropna(axis=0, how='any')

Then we extract the Dataset variable as our target variable and all other columns as features. We convert these to numpy arrays.

In [10]:
# Feature matrix: every column except the label.
feature_frame = df.drop(columns=['Dataset'])
X = feature_frame.to_numpy()
# Target vector: the recoded 0/1 label column.
y = df['Dataset'].to_numpy()


We split the dataset into a training set and a validation set and fit a logistic regression model. We then look at precision and recall on the validation set, and these look somewhat disappointing: only 15% of the liver patients are identified.

In [11]:
# Split features and labels into a training part and a validation part.
# test_size is presumably the held-out fraction and random_state the seed that makes
# the split repeatable (train_test_split comes from the star import of `ml` -- confirm).
VALID_FRACTION = 0.2
SPLIT_SEED = 0
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, test_size=VALID_FRACTION, random_state=SPLIT_SEED
)

In [12]:
# Baseline: a logistic regression created with no arguments, fitted on the untouched training split.
model = LogisticRegression()
# Kept as the last expression so the value returned by fit() is shown as the cell output.
model.fit(train_X, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [13]:
# Predict labels for the validation rows with the baseline model.
pred_y = model.predict(valid_X)

In [14]:
# Recall of the baseline predictions against the validation labels.
recall_score(valid_y, pred_y)

0.14705882352941177

In [15]:
# Precision of the baseline predictions against the validation labels.
precision_score(valid_y, pred_y)

0.5555555555555556

# Balancing

In this case we are mostly interested in positive cases if we wish to identify cases of liver disease. However, the dataset appears to be skewed towards people that do not have the disease. Therefore, the prediction of negative cases is rewarded more than the prediction of positive cases.

We can balance the dataset by oversampling the positive cases. When we check, we see that negative cases occur roughly two and a half times as often as positive cases (414 vs. 165). So when we simply include each positive training sample three times (the original plus two copies), we improve the balance over the two classes. Note that we could also match the number of positive and negative samples exactly, but a rough approximation should already bring much improvement.

Note that we only oversample the training set! Do not touch the validation or test set.

In [16]:
# Class frequencies of the recoded label. These counts cover the whole frame,
# i.e. training and validation rows together, not only the training split.
df['Dataset'].value_counts()

0    414
1    165
Name: Dataset, dtype: int64

In [17]:
# Select the training rows labelled 1 and append them twice, so that each of
# them occurs three times in the balanced feature matrix Xb.
positive_rows = train_y == 1
train_X1 = train_X[positive_rows]
# Row-wise stacking; the order (original, copy, copy) must match the label cell that builds yb.
Xb = np.concatenate([train_X, train_X1, train_X1], axis=0)

In [18]:
# Labels for the oversampled set: the original training labels followed by two
# extra copies of the class-1 labels, in the same order as the rows stacked into Xb.
positive_labels = train_y == 1
train_y1 = train_y[positive_labels]
yb = np.concatenate([train_y, train_y1, train_y1])

In [19]:
# Fresh logistic regression, again created with no arguments, fitted on the oversampled training data.
# This rebinds `model`, so the baseline estimator from the earlier cell is no longer reachable by that name.
model = LogisticRegression()
# Kept as the last expression so the value returned by fit() is shown as the cell output.
model.fit(Xb, yb)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

And indeed, we see that recall has improved from 15% to over 80%. As always, we do see there is a tradeoff against precision from 55% to just over 40%. Whether this is favorable depends on the objective, but for identifying liver patients this appears to be a nice improvement.

In [20]:
# Predict on the same, untouched validation rows with the model trained on the oversampled data.
pred_y = model.predict(valid_X)

In [21]:
# Recall on the validation set for the model trained on the oversampled data.
recall_score(valid_y, pred_y)

0.8235294117647058

In [22]:
# Precision on the validation set for the model trained on the oversampled data.
precision_score(valid_y, pred_y)

0.42424242424242425