In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, RobustScaler
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')



In [16]:
df = pd.read_csv('/content/indian_liver_patient.csv')
df.columns = df.columns.map(str.lower)
df.head()

Unnamed: 0,age,gender,total_bilirubin,direct_bilirubin,alkaline_phosphotase,alamine_aminotransferase,aspartate_aminotransferase,total_protiens,albumin,albumin_and_globulin_ratio,dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         583 non-null    int64  
 1   gender                      583 non-null    object 
 2   total_bilirubin             583 non-null    float64
 3   direct_bilirubin            583 non-null    float64
 4   alkaline_phosphotase        583 non-null    int64  
 5   alamine_aminotransferase    583 non-null    int64  
 6   aspartate_aminotransferase  583 non-null    int64  
 7   total_protiens              583 non-null    float64
 8   albumin                     583 non-null    float64
 9   albumin_and_globulin_ratio  579 non-null    float64
 10  dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [18]:
df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
age,583.0,,,,44.746141,16.189833,4.0,33.0,45.0,58.0,90.0
gender,583.0,2.0,Male,441.0,,,,,,,
total_bilirubin,583.0,,,,3.298799,6.209522,0.4,0.8,1.0,2.6,75.0
direct_bilirubin,583.0,,,,1.486106,2.808498,0.1,0.2,0.3,1.3,19.7
alkaline_phosphotase,583.0,,,,290.576329,242.937989,63.0,175.5,208.0,298.0,2110.0
alamine_aminotransferase,583.0,,,,80.713551,182.620356,10.0,23.0,35.0,60.5,2000.0
aspartate_aminotransferase,583.0,,,,109.910806,288.918529,10.0,25.0,42.0,87.0,4929.0
total_protiens,583.0,,,,6.48319,1.085451,2.7,5.8,6.6,7.2,9.6
albumin,583.0,,,,3.141852,0.795519,0.9,2.6,3.1,3.8,5.5
albumin_and_globulin_ratio,579.0,,,,0.947064,0.319592,0.3,0.7,0.93,1.1,2.8


In [19]:
df.albumin_and_globulin_ratio.fillna(df.albumin_and_globulin_ratio.mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.albumin_and_globulin_ratio.fillna(df.albumin_and_globulin_ratio.mean(), inplace=True)


In [38]:
df.corr().style.background_gradient(cmap='viridis')

Unnamed: 0,age,gender,total_bilirubin,alkaline_phosphotase,alamine_aminotransferase,albumin_and_globulin_ratio,dataset
age,1.0,0.00119,0.092816,0.021902,-0.063383,-0.174992,-0.1193
gender,0.00119,1.0,0.161103,0.054034,0.171135,-0.002096,-0.047735
total_bilirubin,0.092816,0.161103,1.0,0.379342,0.465926,-0.348195,-0.37268
alkaline_phosphotase,0.021902,0.054034,0.379342,1.0,0.390733,-0.330804,-0.243892
alamine_aminotransferase,-0.063383,0.171135,0.465926,0.390733,1.0,-0.08759,-0.326475
albumin_and_globulin_ratio,-0.174992,-0.002096,-0.348195,-0.330804,-0.08759,1.0,0.234792
dataset,-0.1193,-0.047735,-0.37268,-0.243892,-0.326475,0.234792,1.0


In [22]:
df.drop(['direct_bilirubin', 'aspartate_aminotransferase', 'total_protiens', 'albumin'], axis=1, inplace=True)

In [23]:
skewed_cols = ['albumin_and_globulin_ratio','total_bilirubin', 'alkaline_phosphotase', 'alamine_aminotransferase']

In [26]:
for c in skewed_cols:
    df[c] = df[c].apply('log1p')

In [28]:
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])
df.gender.head()

Unnamed: 0,gender
0,0
1,1
2,1
3,1
4,1


In [29]:
rs = RobustScaler()
for c in df[['age', 'gender', 'total_bilirubin', 'alkaline_phosphotase', 'alamine_aminotransferase', 'albumin_and_globulin_ratio']].columns:
    df[c] = rs.fit_transform(df[c].values.reshape(-1, 1))
df.head()

Unnamed: 0,age,gender,total_bilirubin,alkaline_phosphotase,alamine_aminotransferase,albumin_and_globulin_ratio,dataset
0,0.8,-1.0,-0.278571,-0.205486,-0.880118,-0.114392,1
1,0.68,0.0,1.986087,2.128984,0.596977,-0.540218,1
2,0.68,0.0,1.684067,1.541908,0.536171,-0.139286,1
3,0.52,0.0,0.0,-0.258352,-1.043561,0.123492,1
4,1.08,0.0,1.172617,-0.124206,-0.277631,-1.70574,1


In [30]:
from sklearn.utils import resample
df.dataset.value_counts()

Unnamed: 0_level_0,count
dataset,Unnamed: 1_level_1
1,416
2,167


In [31]:
minority = df[df.dataset==2]
majority = df[df.dataset==1]

print('Minority size:', minority.shape)
print('Majority size:', majority.shape)

Minority size: (167, 7)
Majority size: (416, 7)


In [32]:
minority_upsample = resample(minority, replace=True, n_samples=majority.shape[0])
print('Minority upsampled size:', minority_upsample.shape)

Minority upsampled size: (416, 7)


In [33]:
df = pd.concat([minority_upsample, majority], axis=0)

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop('dataset', axis=1), df['dataset'], test_size=0.25, random_state=123)

print('Train values shape:', X_train.shape)
print('Test values shape:', X_test.shape)
print('Train target shape:', y_train.shape)
print('Test target shape:', y_test.shape)

Train values shape: (624, 6)
Test values shape: (208, 6)
Train target shape: (624,)
Test target shape: (208,)


In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [36]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_train_hat = model.predict(X_train)
y_test_hat = model.predict(X_test)

print(model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, y_train_hat))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_test_hat))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_test_hat))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_test_hat))

LogisticRegression()
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.76      0.61      0.68       313
           2       0.68      0.81      0.74       311

    accuracy                           0.71       624
   macro avg       0.72      0.71      0.71       624
weighted avg       0.72      0.71      0.71       624

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.72      0.45      0.55       103
           2       0.60      0.83      0.70       105

    accuracy                           0.64       208
   macro avg       0.66      0.64      0.62       208
weighted avg       0.66      0.64      0.63       208

Roc_auc score
-------------------------------------------------------
0.6375866851595008

Confusion matrix
-------------------------------------------------------
[[46 57]
 [18 8

In [37]:
model = SVC()
model.fit(X_train, y_train)
y_train_hat = model.predict(X_train)
y_test_hat = model.predict(X_test)

print(model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, y_train_hat))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_test_hat))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_test_hat))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_test_hat))

SVC()
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.85      0.62      0.71       313
           2       0.70      0.89      0.78       311

    accuracy                           0.75       624
   macro avg       0.77      0.75      0.75       624
weighted avg       0.77      0.75      0.75       624

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.80      0.47      0.59       103
           2       0.63      0.89      0.74       105

    accuracy                           0.68       208
   macro avg       0.71      0.68      0.66       208
weighted avg       0.71      0.68      0.66       208

Roc_auc score
-------------------------------------------------------
0.6758668515950069

Confusion matrix
-------------------------------------------------------
[[48 55]
 [12 93]]


In [39]:
from sklearn.tree import DecisionTreeClassifier

In [41]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_train_hat = model.predict(X_train)
y_test_hat = model.predict(X_test)

print(model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, y_train_hat))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_test_hat))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_test_hat))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_test_hat))

DecisionTreeClassifier()
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       313
           2       1.00      1.00      1.00       311

    accuracy                           1.00       624
   macro avg       1.00      1.00      1.00       624
weighted avg       1.00      1.00      1.00       624

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.95      0.71      0.81       103
           2       0.77      0.96      0.86       105

    accuracy                           0.84       208
   macro avg       0.86      0.84      0.83       208
weighted avg       0.86      0.84      0.83       208

Roc_auc score
-------------------------------------------------------
0.835321312991216

Confusion matrix
-------------------------------------------------------
[[ 73  30]
 