In [186]:
pip install catboost



In [187]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

In [188]:
dataset = pd.read_csv("cirrhosis.csv")


In [189]:
print(dataset.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             418 non-null    int64  
 1   N_Days         418 non-null    int64  
 2   Status         418 non-null    object 
 3   Drug           312 non-null    object 
 4   Age            418 non-null    int64  
 5   Sex            418 non-null    object 
 6   Ascites        312 non-null    object 
 7   Hepatomegaly   312 non-null    object 
 8   Spiders        312 non-null    object 
 9   Edema          418 non-null    object 
 10  Bilirubin      418 non-null    float64
 11  Cholesterol    284 non-null    float64
 12  Albumin        418 non-null    float64
 13  Copper         310 non-null    float64
 14  Alk_Phos       312 non-null    float64
 15  SGOT           312 non-null    float64
 16  Tryglicerides  282 non-null    float64
 17  Platelets      407 non-null    float64
 18  Prothrombi

In [190]:
print(dataset.isnull().sum())

ID                 0
N_Days             0
Status             0
Drug             106
Age                0
Sex                0
Ascites          106
Hepatomegaly     106
Spiders          106
Edema              0
Bilirubin          0
Cholesterol      134
Albumin            0
Copper           108
Alk_Phos         106
SGOT             106
Tryglicerides    136
Platelets         11
Prothrombin        2
Stage              6
dtype: int64


In [191]:
def fillmissing(column):
  dataset[column].fillna(dataset[column].mean(),inplace = True)
fillmissing("Copper")
fillmissing("Cholesterol")
fillmissing("Alk_Phos")
fillmissing("SGOT")
fillmissing("Tryglicerides")
fillmissing("Platelets")
fillmissing("Prothrombin")

def fillcat(column):
  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
fillcat("Drug")
fillcat("Ascites")
fillcat("Hepatomegaly")
fillcat("Spiders")
fillcat("Stage")

In [192]:
print(dataset.isnull().sum())

ID               0
N_Days           0
Status           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
dtype: int64


In [193]:
print("Drugs- ",dataset['Drug'].nunique())
print("Ascites- ",dataset['Ascites'].nunique())
print("Hepatomegaly- ",dataset['Hepatomegaly'].nunique())


Drugs-  2
Ascites-  2
Hepatomegaly-  2


In [194]:
label_encoder = LabelEncoder()

def labelencode(column):
  dataset[column] = label_encoder.fit_transform(dataset[column])


labelencode("Ascites")
labelencode("Hepatomegaly")
labelencode("Spiders")

In [195]:
print(dataset.head())

   ID  N_Days Status             Drug    Age Sex  Ascites  Hepatomegaly  \
0   1     400      D  D-penicillamine  21464   F        1             1   
1   2    4500      C  D-penicillamine  20617   F        0             1   
2   3    1012      D  D-penicillamine  25594   M        0             0   
3   4    1925      D  D-penicillamine  19994   F        0             1   
4   5    1504     CL          Placebo  13918   F        0             1   

   Spiders Edema  Bilirubin  Cholesterol  Albumin  Copper  Alk_Phos    SGOT  \
0        1     Y       14.5        261.0     2.60   156.0    1718.0  137.95   
1        1     N        1.1        302.0     4.14    54.0    7394.8  113.52   
2        0     S        1.4        176.0     3.48   210.0     516.0   96.10   
3        1     S        1.8        244.0     2.54    64.0    6121.8   60.63   
4        1     N        3.4        279.0     3.53   143.0     671.0  113.15   

   Tryglicerides  Platelets  Prothrombin  Stage  
0          172.0      19

In [196]:
dataset = dataset.drop(["ID"],axis = 1)

In [197]:
df_encoded = pd.get_dummies(dataset, columns=['Status', 'Drug', 'Sex','Edema'])

In [198]:
x = df_encoded.iloc[:,:-1].values
y = df_encoded.iloc[:,-1].values

In [199]:
print(df_encoded.head())

   N_Days    Age  Ascites  Hepatomegaly  Spiders  Bilirubin  Cholesterol  \
0     400  21464        1             1        1       14.5        261.0   
1    4500  20617        0             1        1        1.1        302.0   
2    1012  25594        0             0        0        1.4        176.0   
3    1925  19994        0             1        1        1.8        244.0   
4    1504  13918        0             1        1        3.4        279.0   

   Albumin  Copper  Alk_Phos  ...  Status_C  Status_CL  Status_D  \
0     2.60   156.0    1718.0  ...         0          0         1   
1     4.14    54.0    7394.8  ...         1          0         0   
2     3.48   210.0     516.0  ...         0          0         1   
3     2.54    64.0    6121.8  ...         0          0         1   
4     3.53   143.0     671.0  ...         0          1         0   

   Drug_D-penicillamine  Drug_Placebo  Sex_F  Sex_M  Edema_N  Edema_S  Edema_Y  
0                     1             0      1      0  

In [200]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state = 10)

In [201]:
classifier = CatBoostClassifier()
classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)

Learning rate set to 0.00645
0:	learn: 0.6818273	total: 10.5ms	remaining: 10.5s
1:	learn: 0.6713556	total: 13.4ms	remaining: 6.68s
2:	learn: 0.6613029	total: 21.3ms	remaining: 7.08s
3:	learn: 0.6520922	total: 25.2ms	remaining: 6.28s
4:	learn: 0.6424270	total: 38.6ms	remaining: 7.68s
5:	learn: 0.6344197	total: 46.5ms	remaining: 7.7s
6:	learn: 0.6238654	total: 49.2ms	remaining: 6.98s
7:	learn: 0.6148633	total: 52.2ms	remaining: 6.47s
8:	learn: 0.6045287	total: 55.3ms	remaining: 6.09s
9:	learn: 0.5939984	total: 58.5ms	remaining: 5.79s
10:	learn: 0.5835297	total: 61.4ms	remaining: 5.52s
11:	learn: 0.5737590	total: 64.5ms	remaining: 5.31s
12:	learn: 0.5636313	total: 67.3ms	remaining: 5.11s
13:	learn: 0.5531367	total: 70.8ms	remaining: 4.98s
14:	learn: 0.5462138	total: 73.5ms	remaining: 4.83s
15:	learn: 0.5362711	total: 77.9ms	remaining: 4.79s
16:	learn: 0.5272365	total: 80.6ms	remaining: 4.66s
17:	learn: 0.5182485	total: 83.2ms	remaining: 4.54s
18:	learn: 0.5111135	total: 87ms	remaining: 4.

In [202]:
cm = confusion_matrix(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)

In [203]:
print(cm)
print(acc)

[[79  0]
 [ 0  5]]
1.0
