## Fetch the dataset from UCI

In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.6-py3-none-any.whl.metadata (5.3 kB)
Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.6
Note: you may need to restart the kernel to use updated packages.




In [2]:
from ucimlrepo import fetch_ucirepo

# Fetch the dataset registered under id 878.
cirrhosis_patient_survival_prediction = fetch_ucirepo(id=878)

# Feature table and target table taken from the fetched dataset object.
X, y = (
    cirrhosis_patient_survival_prediction.data.features,
    cirrhosis_patient_survival_prediction.data.targets,
)

# Print the dataset metadata first, then the per-variable information.
for section in (
    cirrhosis_patient_survival_prediction.metadata,
    cirrhosis_patient_survival_prediction.variables,
):
    print(section)


{'uci_id': 878, 'name': 'Cirrhosis Patient Survival Prediction', 'repository_url': 'https://archive.ics.uci.edu/dataset/878/cirrhosis+patient+survival+prediction+dataset-1', 'data_url': 'https://archive.ics.uci.edu/static/public/878/data.csv', 'abstract': 'Utilize 17 clinical features for predicting survival state of patients with liver cirrhosis. The survival states include 0 = D (death), 1 = C (censored), 2 = CL (censored due to liver transplantation).', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 418, 'num_features': 17, 'feature_types': ['Real', 'Categorical'], 'demographics': ['Age', 'Sex'], 'target_col': ['Status'], 'index_col': ['ID'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5R02G', 'creators': ['E. Dickson', 'P. Grambsch', 'T. Fleming', 'L. Fisher', 'A. Langworthy'], 'intro_paper': {'title': 'Prognos

Save the features and the target as CSV files

In [4]:
# Write the raw features and target into Data/uncleaned/, which is where the
# following cell reads them back from. The folder is created first so the
# notebook runs top to bottom on a fresh checkout.
from pathlib import Path

uncleaned_dir = Path('Data/uncleaned')
uncleaned_dir.mkdir(parents=True, exist_ok=True)

X.to_csv(uncleaned_dir / 'cirrhosis_patient_survival_prediction.csv', index=False)
y.to_csv(uncleaned_dir / 'cirrhosis_patient_survival_prediction_target.csv', index=False)

In [10]:
# pandas is used here before the notebook's main import cell, so import it in
# this cell; otherwise a fresh-kernel, top-to-bottom run fails with NameError.
import pandas as pd

# Read the saved feature and target files back in.
features = pd.read_csv('Data/uncleaned/cirrhosis_patient_survival_prediction.csv')
target = pd.read_csv('Data/uncleaned/cirrhosis_patient_survival_prediction_target.csv')

# Place the target column(s) next to the features (column-wise concatenation).
data = pd.concat([features, target], axis=1)

# NOTE: despite the file name, no cleaning has been applied at this point;
# this is simply the combined table used by the Data Preparation section.
data.to_csv('Data/cleaned_data.csv', index=False)

## Data Preparation

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the combined features + target table and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='Data/cleaned_data.csv')
data.head(n=5)

Unnamed: 0,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261,2.6,156,1718.0,137.95,172,190,12.2,4.0,D
1,D-penicillamine,20617,F,N,Y,Y,N,1.1,302,4.14,54,7394.8,113.52,88,221,10.6,3.0,C
2,D-penicillamine,25594,M,N,N,N,S,1.4,176,3.48,210,516.0,96.1,55,151,12.0,4.0,D
3,D-penicillamine,19994,F,N,Y,Y,S,1.8,244,2.54,64,6121.8,60.63,92,183,10.3,4.0,D
4,Placebo,13918,F,N,Y,Y,N,3.4,279,3.53,143,671.0,113.15,72,136,10.9,3.0,CL


In [4]:
# Summarize the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Drug           313 non-null    object 
 1   Age            418 non-null    int64  
 2   Sex            418 non-null    object 
 3   Ascites        313 non-null    object 
 4   Hepatomegaly   313 non-null    object 
 5   Spiders        313 non-null    object 
 6   Edema          418 non-null    object 
 7   Bilirubin      418 non-null    float64
 8   Cholesterol    312 non-null    object 
 9   Albumin        418 non-null    float64
 10  Copper         312 non-null    object 
 11  Alk_Phos       312 non-null    float64
 12  SGOT           312 non-null    float64
 13  Tryglicerides  312 non-null    object 
 14  Platelets      411 non-null    object 
 15  Prothrombin    416 non-null    float64
 16  Stage          412 non-null    float64
 17  Status         418 non-null    object 
dtypes: float64

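The dataset contains a mix of numeric and categorical columns. Note that several columns holding numeric-looking values (Cholesterol, Copper, Tryglicerides, Platelets) are stored with the `object` dtype.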

In [7]:
# Show the distinct values of every column to spot odd or placeholder entries.
columns = data.columns
for col in columns:
    distinct_values = data[col].unique()
    print(f'{col} : {distinct_values}')

Drug : ['D-penicillamine' 'Placebo' 'NaNN' nan]
Age : [21464 20617 25594 19994 13918 24201 20284 19379 15526 25772 19619 21600
 16688 20535 23612 14772 19060 19698 18102 21898 23445 20555 20442 16261
 16463 19002 19884 16417 23331 15116 15177 19722 18731 19015 17758 20604
 22546 13378 20232 17046 12285 12307 17850 13727 15265 16728 17323 17947
 22336 19544 19025 18460 24621 14317 24020 12279 19567 16279 14754 21324
 16034 22173 17031 22977 14684 16967 18733 11912 18021 20600 17841 11868
 14060 18964 15895 18972 18199 17512 16990 24622 23107 24585 20459 20392
 17246 19270 13616 15119 19155 12227 16658 28018 13344 19693 16944 26259
 10550 17703 18799 16418 20662 17884 15712 12433 25023 22836 18393 16094
 14212 15031 20256 18713 19295 15574 22306 18137 17844 19817 12839 24803
 20248 16736 19318 17233 19577 16109 15322 23235 16154 22646 14812 22881
 15463 15694 20440 22960 18719 17080 19751 17180 20354 16839 19098 18701
 12369 27398 11273 22574 12779 20104 25546 18118 25340 15909 21699 178

Missing values

In [8]:
# Count the null entries of all columns once, then report them column by column.
null_counts = data.isnull().sum()
for col in columns:
    missing = null_counts[col]
    print(f'{col} : {missing}')

Drug : 105
Age : 0
Sex : 0
Ascites : 105
Hepatomegaly : 105
Spiders : 105
Edema : 0
Bilirubin : 0
Cholesterol : 106
Albumin : 0
Copper : 106
Alk_Phos : 106
SGOT : 106
Tryglicerides : 106
Platelets : 7
Prothrombin : 2
Stage : 6
Status : 0


About a quarter of the values are missing in several columns (105–106 of the 418 rows), so check the dtype of each affected column.

In [13]:
# Report only the columns that contain nulls, together with their dtype.
# The dtype is kept in `col_dtype` instead of `type` so that the built-in
# `type()` is not shadowed for the rest of the kernel session.
for col in columns:
    missing = data[col].isnull().sum()
    col_dtype = data[col].dtype
    if missing > 0:
        print(f'{col} : {missing} missing values, dtype is {col_dtype}')

Drug : 105 missing values, dtype is object
Ascites : 105 missing values, dtype is object
Hepatomegaly : 105 missing values, dtype is object
Spiders : 105 missing values, dtype is object
Cholesterol : 106 missing values, dtype is object
Copper : 106 missing values, dtype is object
Alk_Phos : 106 missing values, dtype is float64
SGOT : 106 missing values, dtype is float64
Tryglicerides : 106 missing values, dtype is object
Platelets : 7 missing values, dtype is object
Prothrombin : 2 missing values, dtype is float64
Stage : 6 missing values, dtype is float64


Clean the string values first.

'S'

In [33]:
# Collect the index labels of rows whose Drug value is the literal string
# "NaNN", then remove those rows from `data` in place.
nann_drug_rows = data.index[data["Drug"] == "NaNN"]
data.drop(index=nann_drug_rows, inplace=True)

In [38]:
# Display the frame to inspect it after the rows with Drug == "NaNN" were dropped.
data

Unnamed: 0,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261,2.60,156,1718.0,137.95,172,190,12.2,4.0,D
1,D-penicillamine,20617,F,N,Y,Y,N,1.1,302,4.14,54,7394.8,113.52,88,221,10.6,3.0,C
2,D-penicillamine,25594,M,N,N,N,S,1.4,176,3.48,210,516.0,96.10,55,151,12.0,4.0,D
3,D-penicillamine,19994,F,N,Y,Y,S,1.8,244,2.54,64,6121.8,60.63,92,183,10.3,4.0,D
4,Placebo,13918,F,N,Y,Y,N,3.4,279,3.53,143,671.0,113.15,72,136,10.9,3.0,CL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,D-penicillamine,24472,F,N,Y,N,N,1.2,NaNN,2.96,52,559.0,71.30,NaNN,174,10.9,3.0,D
414,D-penicillamine,14245,F,N,Y,N,N,0.9,NaNN,3.83,52,559.0,71.30,NaNN,180,11.2,4.0,C
415,D-penicillamine,20819,F,N,Y,N,N,1.6,NaNN,3.42,52,559.0,71.30,NaNN,143,9.9,3.0,C
416,D-penicillamine,21185,F,N,Y,N,N,0.8,NaNN,3.75,52,559.0,71.30,NaNN,269,10.4,3.0,C
