In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

In [2]:
liver_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/liver_patient_data/indian_liver_patient_dataset.csv')
liver_data.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_Problem
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [3]:
liver_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         500 non-null    int64  
 1   Gender                      500 non-null    object 
 2   Total_Bilirubin             500 non-null    float64
 3   Direct_Bilirubin            500 non-null    float64
 4   Alkaline_Phosphotase        500 non-null    int64  
 5   Alamine_Aminotransferase    500 non-null    int64  
 6   Aspartate_Aminotransferase  500 non-null    int64  
 7   Total_Protiens              500 non-null    float64
 8   Albumin                     500 non-null    float64
 9   Albumin_and_Globulin_Ratio  496 non-null    float64
 10  Liver_Problem               500 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 43.1+ KB


In [4]:
liver_data.Albumin_and_Globulin_Ratio.fillna(liver_data['Albumin_and_Globulin_Ratio'].mean(), inplace=True)

In [5]:
le = LabelEncoder()
liver_data.Gender = le.fit_transform(liver_data.Gender)
liver_data.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_Problem
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [6]:
X = liver_data.drop('Liver_Problem', axis = 1) 
y = liver_data['Liver_Problem']

In [35]:
y.value_counts()

1    350
2    150
Name: Liver_Problem, dtype: int64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 53)

In [106]:
lr = LogisticRegression(fit_intercept=False, tol=30) #(C=2.559, penalty='l2')
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=30, verbose=0,
                   warm_start=False)

In [107]:
lr_pred = lr.predict(X_test)

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score 

accuracy_lr = accuracy_score(y_test, lr_pred)
auc_roc_lr = roc_auc_score(y_test, lr_pred)
f1_lr = f1_score(y_test, lr_pred)
print("Accuracy score of the LR model is: {}".format(accuracy_lr))
print("roc_auc_score of the LR model is: {}".format(auc_roc_lr))
print("F1 score of the LR model is: {}".format(f1_lr))

Accuracy score of the LR model is: 0.68
roc_auc_score of the LR model is: 0.5459068294889191
F1 score of the LR model is: 0.7974683544303798


In [10]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [11]:
dtc_pred = dtc.predict(X_test)

accuracy_dtc = accuracy_score(y_test, dtc_pred)
auc_roc_dtc = roc_auc_score(y_test, dtc_pred)
f1_dtc = f1_score(y_test, dtc_pred)
print("Accuracy score of the DTC model is: {}".format(accuracy_dtc))
print("roc_auc_score of the DTC model is: {}".format(auc_roc_dtc))
print("F1 score of the DTC model is: {}".format(f1_dtc))

Accuracy score of the DTC model is: 0.51
roc_auc_score of the DTC model is: 0.44210764360018096
F1 score of the DTC model is: 0.6370370370370371


In [149]:
from sklearn.ensemble import RandomForestClassifier

In [181]:
rfc = RandomForestClassifier(criterion='entropy', max_depth=3, random_state=42)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=70,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [182]:
rfc_pred = rfc.predict(X_test)

accuracy_rfc = accuracy_score(y_test, rfc_pred)
auc_roc_rfc = roc_auc_score(y_test, rfc_pred)
f1_rfc = f1_score(y_test, rfc_pred)
print("Accuracy score of the RFC model is: {}".format(accuracy_rfc))
print("roc_auc_score of the RFC model is: {}".format(auc_roc_rfc))
print("F1 score of the RFC model is: {}".format(f1_rfc))

Accuracy score of the RFC model is: 0.68
roc_auc_score of the RFC model is: 0.5151515151515151
F1 score of the RFC model is: 0.8072289156626505


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler

In [18]:
# Create an scaler object
sc = StandardScaler()

# Create a pca object
#pca = decomposition.PCA()

# Create a logistic regression object with an L2 penalty
logistic = LogisticRegression()

# Create a pipeline of three steps. First, standardize the data.
# Second, tranform the data with PCA.
# Third, train a logistic regression on the data.
pipe = Pipeline(steps= [('sc', sc),
                        ('logistic', logistic)]) #[('pca', pca),

# Create Parameter Space
# Create a list of a sequence of integers from 1 to 30 (the number of features in X + 1)
n_components = list(range(1,X_test.shape[1]+1,1))
# Create a list of values of the regularization parameter
C = np.logspace(-4, 4, 50)
# Create a list of options for the regularization penalty
penalty = ['l1', 'l2']
# Create a dictionary of all the parameter options 
# Note has you can access the parameters of steps of a pipeline by using '__’
parameters = dict( #pca__n_components=n_components,
                  logistic__C=C,
                  logistic__penalty=penalty)

# Conduct Parameter Optmization With Pipeline
# Create a grid search object
clf = GridSearchCV(pipe, parameters)

# Fit the grid search
clf.fit(X_test, y_test)
# View The Best Parameters
print('Best Penalty:', clf.best_estimator_.get_params()['logistic__penalty'])
print('Best C:', clf.best_estimator_.get_params()['logistic__C'])
#print('Best Number Of Components:', clf.best_estimator_.get_params()['pca__n_components'])
print(); print(clf.best_estimator_.get_params()['logistic'])

Best Penalty: l2
Best C: 2.559547922699533

LogisticRegression(C=2.559547922699533, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [19]:
# Use Cross Validation To Evaluate Model
CV_Result = cross_val_score(clf, X_test, y_test, cv=4, n_jobs=-1)
print(); print(CV_Result)
print(); print(CV_Result.mean())
print(); print(CV_Result.std())


[0.68 0.68 0.6  0.72]

0.6699999999999999

0.04358898943540674


In [33]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(max_depth=3, learning_rate=0.30505) #, booster='dart') #, eval_metric='error@0.7') #, objective='binary:hinge')
xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.30505, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [34]:
xgb_pred = xgb_model.predict(X_test)

accuracy_xgb = accuracy_score(y_test, xgb_pred)
#acc.append(accuracy_xgb)
auc_roc_xgb = roc_auc_score(y_test, xgb_pred)
#auc.append(auc_roc_xgb)
f1_xgb = f1_score(y_test, xgb_pred)
#f1.append(f1_xgb)
print("Accuracy score of the XGB model is: {}".format(accuracy_xgb))
print("roc_auc_score of the XGB model is: {}".format(auc_roc_xgb))
print("F1 score of the XGB model is: {}".format(f1_xgb))

Accuracy score of the XGB model is: 0.55
roc_auc_score of the XGB model is: 0.47195838986883765
F1 score of the XGB model is: 0.6762589928057554


In [24]:
test_new = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/liver_patient_data/indian_liver_patient_new_testdataset.csv')

In [25]:
test_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 0 to 81
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         82 non-null     int64  
 1   Gender                      82 non-null     object 
 2   Total_Bilirubin             82 non-null     float64
 3   Direct_Bilirubin            82 non-null     float64
 4   Alkaline_Phosphotase        82 non-null     int64  
 5   Alamine_Aminotransferase    82 non-null     int64  
 6   Aspartate_Aminotransferase  82 non-null     int64  
 7   Total_Protiens              82 non-null     float64
 8   Albumin                     82 non-null     float64
 9   Albumin_and_Globulin_Ratio  82 non-null     float64
dtypes: float64(5), int64(4), object(1)
memory usage: 6.5+ KB


In [26]:
test_new.Gender = le.fit_transform(test_new.Gender)
test_new.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
0,36,1,2.8,1.5,305,28,76,5.9,2.5,0.7
1,42,1,0.8,0.2,127,29,30,4.9,2.7,1.2
2,53,1,19.8,10.4,238,39,221,8.1,2.5,0.4
3,32,1,30.5,17.1,218,39,79,5.5,2.7,0.9
4,32,1,32.6,14.1,219,95,235,5.8,3.1,1.1


In [94]:
lr_predictions = lr.predict(test_new)

In [None]:
res = pd.DataFrame(lr_predictions) #preditcions are nothing but the final predictions of your model on input features of your new unseen test data
res.index = test_new.index # its important for comparison. Here "test_new" is your new test dataset
res.columns = ["prediction"]
res.to_csv("prediction_results_log_reg.csv")

In [161]:
rfc_predictions = rfc.predict(test_new)

In [162]:
res2 = pd.DataFrame(rfc_predictions) #preditcions are nothing but the final predictions of your model on input features of your new unseen test data
res2.index = test_new.index # its important for comparison. Here "test_new" is your new test dataset
res2.columns = ["prediction"]
res2.to_csv("prediction_results_log_rfc_reg.csv")