In [1]:
import xgboost as xgb 
import pandas as pd 

from sklearn.pipeline import Pipeline 
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder 
from sklearn.model_selection import train_test_split 
from sklearn.impute import SimpleImputer 

In [59]:
data = pd.read_csv('cirrhosis.csv')
necessary_cols = ['Status', 'Age', 'Sex', 'Cholesterol', 'Stage', 'Hepatomegaly', 'Bilirubin', 'Albumin', 'Platelets', 'Drug']

data = data[necessary_cols]
data 

Unnamed: 0,Status,Age,Sex,Cholesterol,Stage,Hepatomegaly,Bilirubin,Albumin,Platelets,Drug
0,D,21464,F,261.0,4.0,Y,14.5,2.60,190.0,D-penicillamine
1,C,20617,F,302.0,3.0,Y,1.1,4.14,221.0,D-penicillamine
2,D,25594,M,176.0,4.0,N,1.4,3.48,151.0,D-penicillamine
3,D,19994,F,244.0,4.0,Y,1.8,2.54,183.0,D-penicillamine
4,CL,13918,F,279.0,3.0,Y,3.4,3.53,136.0,Placebo
...,...,...,...,...,...,...,...,...,...,...
413,D,24472,F,,3.0,,1.2,2.96,174.0,
414,C,14245,F,,4.0,,0.9,3.83,180.0,
415,C,20819,F,,3.0,,1.6,3.42,143.0,
416,C,21185,F,,3.0,,0.8,3.75,269.0,


In [60]:
data.columns

Index(['Status', 'Age', 'Sex', 'Cholesterol', 'Stage', 'Hepatomegaly',
       'Bilirubin', 'Albumin', 'Platelets', 'Drug'],
      dtype='object')

In [12]:
data['Hepatomegaly'].value_counts()

Hepatomegaly
Y    160
N    152
Name: count, dtype: int64

In [54]:

data['Hepatomegaly'] = (data['Hepatomegaly'] == 'Y').astype(int)
data['Sex'] = (data['Sex'] == 'M').astype(int)


num_cols = ['Bilirubin', 'Albumin', 'Platelets', 'Cholesterol']
categorical_cols = ['Drug', 'Status', 'Stage']

num_imputer = SimpleImputer(strategy='constant')
encoder = OrdinalEncoder()

In [65]:
cat_transform = Pipeline(steps=[
    ('imputer', (SimpleImputer(strategy='most_frequent'))),
    ('onehot', (OneHotEncoder(handle_unknown='ignore'))),
    ('ordencode', (OrdinalEncoder()))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_imputer, num_cols),
        ('cat', cat_transform, categorical_cols),
        ('enc', encoder, data['Status'])
    ]
)

In [66]:
x = data.drop(columns=['Status'])
y = data['Status']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0) 


model = xgb.XGBClassifier(
                     objective='multi:softmax',
                     max_depth=5,
                     n_estimators=100,
                     random_state=42,
                     learning_rate=0.1,
                )

In [67]:
my_pipeline = Pipeline(steps=[
                        ('preprocessor', preprocessor),
                        ('model', model)
                      ]) 

In [68]:
my_pipeline.fit(x_train, y_train)
survival_prediction = my_pipeline.predict(x_test)
pred_accuracy = my_pipeline.score(x_test, y_test) 

pred_accuracy

ValueError: A given column is not a column of the dataframe

In [61]:
metrics = classification_report(y_test,survival_prediction)

print(metrics)

NameError: name 'survival_prediction' is not defined