In [1]:
import pandas as pd
import numpy as np

# plottnig
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv('dataset-kidney-stone.csv')
df.head()


Unnamed: 0.1,Unnamed: 0,gravity,ph,osmo,cond,urea,calc,target
0,0,1.021,4.91,725,14.0,443,2.45,0
1,1,1.017,5.74,577,20.0,296,4.49,0
2,2,1.008,7.2,321,14.9,101,2.36,0
3,3,1.011,5.51,408,12.6,224,2.15,0
4,4,1.005,6.52,187,7.5,91,1.16,0


In [3]:
df.drop(df.columns[0], axis=1, inplace=True) # removing the first column
df.head()


Unnamed: 0,gravity,ph,osmo,cond,urea,calc,target
0,1.021,4.91,725,14.0,443,2.45,0
1,1.017,5.74,577,20.0,296,4.49,0
2,1.008,7.2,321,14.9,101,2.36,0
3,1.011,5.51,408,12.6,224,2.15,0
4,1.005,6.52,187,7.5,91,1.16,0


In [5]:
df.skew() # checking the skewness of the data

gravity    0.152631
ph         0.693656
osmo       0.298551
cond      -0.016312
urea       0.433230
calc       0.934628
target     0.000000
dtype: float64

In [7]:
# ph and calc has unacceptable skewness so applying Log Transformation on them

df[['ph', 'calc']] = np.log1p(df[['ph', 'calc']])
df.head()


Unnamed: 0,gravity,ph,osmo,cond,urea,calc,target
0,1.021,1.776646,725,14.0,443,1.238374,0
1,1.017,1.90806,577,20.0,296,1.702928,0
2,1.008,2.104134,321,14.9,101,1.211941,0
3,1.011,1.873339,408,12.6,224,1.147402,0
4,1.005,2.017566,187,7.5,91,0.770108,0


In [9]:
df.skew() # new skewness after log transformation


gravity    0.152631
ph         0.453173
osmo       0.298551
cond      -0.016312
urea       0.433230
calc      -0.024415
target     0.000000
dtype: float64

In [11]:
# checking for outliers

for column in df.columns:
    print(len(df[column][df[column] > df[column].quantile(0.75)])) # printing the number of outliers in each column


22
23
23
23
23
23
0


In [14]:
# Applying square root transformation to remove outliers

c = ['gravity', 'ph', 'osmo', 'cond', 'urea', 'calc']

df[c] = np.sqrt(df[c])
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   gravity  90 non-null     float64
 1   ph       90 non-null     float64
 2   osmo     90 non-null     float64
 3   cond     90 non-null     float64
 4   urea     90 non-null     float64
 5   calc     90 non-null     float64
 6   target   90 non-null     int64  
dtypes: float64(6), int64(1)
memory usage: 5.0 KB


In [15]:
# Standardize the data

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df_scaled = scaler.fit_transform(df)
df_scaled = pd.DataFrame(df_scaled, columns=df.columns)

df_scaled.head()


Unnamed: 0,gravity,ph,osmo,cond,urea,calc,target
0,0.454209,-1.777954,0.611284,-0.757811,1.196015,-0.120145,-1.0
1,-0.138782,-0.375026,0.048052,0.071382,0.443779,0.531731,-1.0
2,-1.479433,1.589174,-1.259387,-0.618245,-1.228043,-0.162449,-1.0
3,-1.031555,-0.738463,-0.747724,-0.988936,-0.033601,-0.268725,-1.0
4,-1.928312,0.739668,-2.305702,-2.042285,-1.367513,-1.000712,-1.0


In [18]:
df_scaled['target'] = df_scaled['target'].replace(-1, 0) # convert -1 to 0, for XGBoost to work
print(df_scaled['target'].unique())


[0. 1.]


In [19]:
# Splitting the dataset

from sklearn.model_selection import train_test_split

X = df_scaled.drop('target', axis=1)
y = df_scaled['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
# training XGBoost model

import xgboost as xgb

model = xgb.XGBClassifier()

model.fit(X_train, y_train)


In [23]:
# testing the model

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = model.predict(X_test)

print("MAE;", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R^2:", r2_score(y_test, y_pred))


MAE; 0.4444444444444444
MSE: 0.4444444444444444
R^2: -1.0


In [24]:
from sklearn.model_selection import RandomizedSearchCV

param = {
    'learning_rate': [0.001, 0.01, 0.1],
    'max_depth': [2, 3, 4],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 1]
}

random_cv = RandomizedSearchCV(estimator=model, param_distributions=param, 
                               n_iter=10, scoring='accuracy', cv=5, n_jobs=-1)

random_cv.fit(X_train, y_train)


In [25]:
# training XGBoost model with best parameters

model = xgb.XGBClassifier(learning_rate=random_cv.best_params_['learning_rate'], 
                              max_depth=random_cv.best_params_['max_depth'], 
                              n_estimators=random_cv.best_params_['n_estimators'], 
                              gamma=random_cv.best_params_['gamma'])

model.fit(X_train, y_train)


In [27]:
# MSE, MAE, R^2

y_pred = model.predict(X_test)

print("MAE;", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R^2:", r2_score(y_test, y_pred))

# accuracy
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))


MAE; 0.2777777777777778
MSE: 0.2777777777777778
R^2: -0.25
Accuracy: 0.7222222222222222


In [32]:
y_pred = model.predict(X) # accuracy on the whole dataset

print("Accuracy on whole dataset:", accuracy_score(y, y_pred)*100, "%.")


Accuracy on whole dataset: 92.22222222222223 %.


In [33]:
# Before using Random CV, Error (MSE) was 0.44, after using best parameters from Random CV, it got down to 0.27 only.

# Also accuracy is 72% which is good for this model trained on small dataset of only 80% of 90 rows.

# The XGboost is good and fast, nice to predict kidney stone.