## Data Reading, Cleaning and Improving

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('kidney_disease.csv')

In [3]:
df.tail()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
395,395,55.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd
399,399,58.0,80.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,53,6800,6.1,no,no,no,good,no,no,notckd


In [4]:
columns = pd.read_csv(r"D:\GIT and Programming\Chronic Kidney Disease/data_description.txt", sep='-')
columns = columns.reset_index()

In [5]:
columns.columns=['cols','abb_col_names']

In [6]:
df.columns=columns['abb_col_names'].values

In [7]:
def convert_dtype(df, feature):
    df[feature] = pd.to_numeric(df[feature], errors='coerce')

In [8]:
features = ['packed cell volume', 'white blood cell count', 'red blood cell count']
for feature in features:
    convert_dtype(df, feature)

In [9]:
df.drop('id', axis=1, inplace=True)

In [10]:
def extract_cat_num(df):
    cat_col = [col for col in df.columns if df[col].dtype == 'object']
    num_col = [col for col in df.columns if df[col].dtype != 'object']
    return cat_col, num_col

In [11]:
cat_col, num_col = extract_cat_num(df)

In [12]:
df['diabetes mellitus'].replace(to_replace={'\tno':'no', '\tyes':'yes'}, inplace=True)
df['coronary artery disease'].replace(to_replace={'\tno':'no'}, inplace=True)
df['class'] = df['class'].replace(to_replace='ckd\t', value='ckd')

In [13]:
data = df.copy()

In [14]:
def assigning_missing_values(feature):
    random_sample=data[feature].dropna().sample(data[feature].isnull().sum())
    random_sample.index = data[data[feature].isnull()].index
    data.loc[data[feature].isnull(), feature] = random_sample

In [15]:
for col in num_col:
    assigning_missing_values(col)

In [16]:
for col in cat_col:
    assigning_missing_values(col)

In [17]:
le = LabelEncoder()

In [18]:
for col in cat_col:
    data[col] = le.fit_transform(data[col])

## Train Test Splitting

In [19]:
def data_split(entry, ratio):
    # np.random.seed(42)
    shuffled = np.random.permutation(len(entry))
    test_set_size = int(len(entry) * ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [20]:
train, test = data_split(data, 0.25)

In [21]:
train

Unnamed: 0,age,blood pressure,specific gravity,albumin,sugar,red blood cells,pus cell,pus cell clumps,bacteria,blood glucose random,...,packed cell volume,white blood cell count,red blood cell count,ypertension,diabetes mellitus,coronary artery disease,appetite,pedal edema,anemia,class
391,36.0,80.0,1.025,0.0,0.0,1,1,0,0,85.0,...,44.0,5800.0,6.3,0,1,0,0,0,0,1
168,65.0,70.0,1.015,4.0,4.0,1,1,1,0,307.0,...,39.0,6700.0,3.2,1,2,0,0,0,0,0
226,64.0,100.0,1.015,4.0,2.0,0,0,0,1,163.0,...,26.0,7500.0,3.4,1,2,0,0,1,0,0
68,65.0,70.0,1.010,2.0,0.0,1,1,1,0,112.0,...,37.0,9200.0,4.3,0,1,0,0,0,0,0
60,67.0,90.0,1.020,1.0,0.0,1,0,1,0,141.0,...,45.0,10200.0,5.6,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,37.0,60.0,1.025,0.0,0.0,1,1,0,0,111.0,...,50.0,5500.0,5.7,0,1,0,0,0,0,1
35,65.0,90.0,1.020,2.0,1.0,0,1,0,0,270.0,...,36.0,9800.0,4.9,1,2,0,1,0,1,0
382,48.0,80.0,1.025,0.0,0.0,1,1,0,0,75.0,...,51.0,6000.0,6.5,0,1,0,0,0,0,1
293,50.0,80.0,1.020,0.0,0.0,1,1,0,0,92.0,...,48.0,4700.0,5.4,0,1,0,0,0,0,1


In [22]:
X_train = train[['age', 'blood pressure', 'specific gravity', 'albumin', 'sugar',
       'red blood cells', ' pus cell', 'pus cell clumps', 'bacteria',
       'blood glucose random', 'blood urea', 'serum creatinine', 'sodium',
       'potassium', 'haemoglobin', 'packed cell volume',
       'white blood cell count', 'red blood cell count', 'ypertension',
       'diabetes mellitus', 'coronary artery disease', 'appetite',
       'pedal edema', 'anemia']].to_numpy()

In [23]:
X_test = test[['age', 'blood pressure', 'specific gravity', 'albumin', 'sugar',
       'red blood cells', ' pus cell', 'pus cell clumps', 'bacteria',
       'blood glucose random', 'blood urea', 'serum creatinine', 'sodium',
       'potassium', 'haemoglobin', 'packed cell volume',
       'white blood cell count', 'red blood cell count', 'ypertension',
       'diabetes mellitus', 'coronary artery disease', 'appetite',
       'pedal edema', 'anemia']].to_numpy()

In [24]:
Y_train = train[['class']].to_numpy().reshape(300,)

In [25]:
Y_test = test[['class']].to_numpy().reshape(100,)

In [26]:
Y_train

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1])

In [27]:
clf = LogisticRegression(solver='lbfgs', max_iter=10000)
clf.fit(X_train, Y_train)

In [28]:
train

Unnamed: 0,age,blood pressure,specific gravity,albumin,sugar,red blood cells,pus cell,pus cell clumps,bacteria,blood glucose random,...,packed cell volume,white blood cell count,red blood cell count,ypertension,diabetes mellitus,coronary artery disease,appetite,pedal edema,anemia,class
391,36.0,80.0,1.025,0.0,0.0,1,1,0,0,85.0,...,44.0,5800.0,6.3,0,1,0,0,0,0,1
168,65.0,70.0,1.015,4.0,4.0,1,1,1,0,307.0,...,39.0,6700.0,3.2,1,2,0,0,0,0,0
226,64.0,100.0,1.015,4.0,2.0,0,0,0,1,163.0,...,26.0,7500.0,3.4,1,2,0,0,1,0,0
68,65.0,70.0,1.010,2.0,0.0,1,1,1,0,112.0,...,37.0,9200.0,4.3,0,1,0,0,0,0,0
60,67.0,90.0,1.020,1.0,0.0,1,0,1,0,141.0,...,45.0,10200.0,5.6,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,37.0,60.0,1.025,0.0,0.0,1,1,0,0,111.0,...,50.0,5500.0,5.7,0,1,0,0,0,0,1
35,65.0,90.0,1.020,2.0,1.0,0,1,0,0,270.0,...,36.0,9800.0,4.9,1,2,0,1,0,1,0
382,48.0,80.0,1.025,0.0,0.0,1,1,0,0,75.0,...,51.0,6000.0,6.5,0,1,0,0,0,0,1
293,50.0,80.0,1.020,0.0,0.0,1,1,0,0,92.0,...,48.0,4700.0,5.4,0,1,0,0,0,0,1


In [29]:
clf.predict_proba([[5.0,	70.0,	1.025,	1.0,	0.0,	0,	1,	0,	0,	97.0,	
             56, 3.8, 111, 2.5, 11.2,
             34.0,	7200.0,	4.1,	0,	1,	0,	1,	1, 0]])

array([[9.99709763e-01, 2.90236627e-04]])

In [30]:
input_features = [5.0,	70.0,	1.025,	1.0,	0.0,	0,	1,	0,	0,	97.0,	
             56, 3.8, 111, 2.5, 11.2,
             34.0,	7200.0,	4.1,	0,	1,	0,	1,	1, 0]
ckd_prob = clf.predict_proba([input_features])[0][1]

In [31]:
ckd_prob

0.0002902366268631531