#### Data Description:
The dataset uses the following column abbreviations:

age - age

bp - blood pressure

sg - specific gravity

al - albumin

su - sugar

rbc - red blood cells

pc - pus cell

pcc - pus cell clumps

ba - bacteria

bgr - blood glucose random

bu - blood urea

sc - serum creatinine

sod - sodium

pot - potassium

hemo - hemoglobin

pcv - packed cell volume

wc - white blood cell count

rc - red blood cell count

htn - hypertension

dm - diabetes mellitus

cad - coronary artery disease

appet - appetite

pe - pedal edema

ane - anemia

classification - class (ckd or notckd)

In [1]:
# Importing libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the kidney disease CSV into a DataFrame
csv_path = "../input/ckdisease/kidney_disease.csv"
df = pd.read_csv(csv_path)

In [3]:
# Preview the first rows of the loaded data
df.head()

In [4]:
# (number of rows, number of columns)
df.shape

In [5]:
# Column dtypes and non-null counts of the raw data
df.info()

In [6]:
# Number of missing values in each column
df.isnull().sum()

In [7]:
# Drop the 'id' column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run is a no-op rather than a KeyError.
df = df.drop(columns='id', errors='ignore')

In [8]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns
df.describe()

In [9]:
# Correlation heat map of the numeric columns.
# Non-numeric columns are excluded explicitly: at this point the frame still
# holds string columns, and pandas >= 2.0 no longer drops them silently in
# DataFrame.corr() but raises on the string values instead.
corrmat = df.select_dtypes(include='number').corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
# corrmat already is the correlation matrix of exactly these features, so it
# is plotted directly instead of recomputing df[top_corr_features].corr().
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

In [10]:
# List the column names
df.columns

In [11]:
# Collect the columns stored with dtype 'object' and display them
object_dtypes = df.select_dtypes(include=['object'])
object_dtypes

In [12]:
# Encode the categorical text labels as numbers.
# Each entry pairs a group of columns with the label -> code table applied to it.
binary_encodings = [
    (['htn', 'dm', 'cad', 'pe', 'ane'], {'yes':1,'no':0}),
    (['rbc', 'pc'], {'abnormal':1,'normal':0}),
    (['pcc', 'ba'], {'present':1,'notpresent':0}),
    # a stray 'no' in 'appet' is treated as missing
    (['appet'], {'good':1,'poor':0,'no':np.nan}),
]
for columns, encoding in binary_encodings:
    df[columns] = df[columns].replace(to_replace=encoding)

# Target column: the table also covers the 'ckd\t' and 'no' spellings
class_encoding = {'ckd':1.0,'ckd\t':1.0,'notckd':0.0,'no':0.0}
df['classification'] = df['classification'].replace(to_replace=class_encoding)

In [13]:
# Inspect the distinct raw values of 'pcv' instead of printing every row:
# the short list of unique values is enough to spot malformed entries.
df['pcv'].unique()

In [14]:
# Inspect the distinct raw values of 'wc' instead of printing every row:
# the short list of unique values is enough to spot malformed entries.
df['wc'].unique()

In [15]:
# Inspect the distinct raw values of 'rc' instead of printing every row:
# the short list of unique values is enough to spot malformed entries.
df['rc'].unique()

In [16]:
# Convert 'wc', 'pcv' and 'rc' to numeric dtype.
# These columns hold numbers as text; some entries carry stray whitespace
# (such as a leading tab) and missing readings are recorded as "?".
# Stripping every entry repairs any padded value rather than only the ones
# listed by hand, and "?" becomes NaN so the later fill step can impute it.
# pd.to_numeric raises on any other unparseable token, so unexpected junk is
# reported here instead of at a later cast.
for column in ('wc', 'pcv', 'rc'):
    stripped = df[column].astype(str).str.strip()
    # 'nan' is what astype(str) produces for values that were already missing
    df[column] = pd.to_numeric(stripped.replace({'?': np.nan, 'nan': np.nan}))

In [17]:
# Distinct 'wc' values, to check the result of the cleanup
df['wc'].unique()

In [18]:
# Distinct 'pcv' values, to check the result of the cleanup
df['pcv'].unique()

In [19]:
# Distinct 'rc' values, to check the result of the cleanup
df['rc'].unique()

In [20]:
# Impute missing values: forward-fill each column from the previous row, then
# back-fill whatever is still missing at the top of the frame.
# DataFrame.ffill()/bfill() replace fillna(method=...), which is deprecated
# in recent pandas releases; the filled values are the same.
df = df.ffill().bfill()

In [21]:
# Re-check dtypes and non-null counts after filling missing values
df.info()

In [22]:
# Missing values left in each column after the fill step
df.isna().sum()

In [23]:
# Cast the three count columns to their final numeric dtypes
for column, target_dtype in (('pcv', int), ('wc', int), ('rc', float)):
    df[column] = df[column].astype(target_dtype)

In [24]:
# Further cleaning: map leftover odd spellings to their numeric codes.
# NOTE(review): this runs after the fill step, so a NaN introduced here
# (the '' entry for 'dm') would not be imputed.
single_value_fixes = [
    ('pe', 'good', 0),
    ('appet', 'no', 0),
    ('cad', '\tno', 0),
]
for column, stray_value, code in single_value_fixes:
    df[column] = df[column].replace(to_replace=stray_value, value=code)

dm_fixes = {'\tno':0,'\tyes':1,' yes':1, '':np.nan}
df['dm'] = df['dm'].replace(to_replace=dm_fixes)

In [25]:
# Dtypes and non-null counts after the additional cleaning
df.info()

In [26]:
# Number of rows per target class
df['classification'].value_counts()

In [27]:
# Count the rows of each target class via boolean masks
is_positive = df['classification'] == 1
is_negative = df['classification'] == 0
target_true_count = int(is_positive.sum())
target_false_count = int(is_negative.sum())
(target_true_count, target_false_count)

In [28]:
# Bar chart of the number of rows in each target class
sns.countplot(data=df, x='classification')

In [29]:
# Histogram of each numeric column, drawn on one large 30x30 figure
df.hist(figsize = (30,30))

In [30]:
# Correlation heat map of the numeric columns after cleaning.
# Selecting numeric columns explicitly keeps DataFrame.corr() from raising on
# any column that is still non-numeric (pandas >= 2.0 no longer drops such
# columns silently).
corrmat = df.select_dtypes(include='number').corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
# corrmat already is the correlation matrix of exactly these features, so it
# is plotted directly instead of recomputing df[top_corr_features].corr().
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

In [31]:
from sklearn.model_selection import train_test_split

# Model inputs: every column except the target
feature_columns = [
    'age', 'bp', 'sg', 'al', 'su',
    'rbc', 'pc', 'pcc', 'ba',
    'bgr', 'bu', 'sc', 'sod', 'pot',
    'hemo', 'pcv', 'wc', 'rc',
    'htn', 'dm', 'cad', 'appet', 'pe', 'ane',
]
# Target column, kept as a one-element list
predicted_class = ['classification']

In [32]:
# Feature matrix and target vector.
# predicted_class is a one-element list, so df[predicted_class] is a
# single-column DataFrame; squeeze it to a 1-D Series, the (n_samples,) shape
# scikit-learn estimators expect for y (a column vector makes fit() emit a
# DataConversionWarning before flattening it itself).
X = df[feature_columns]
y = df[predicted_class].squeeze(axis=1)

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=10)

In [33]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with a fixed seed on the training split
random_forest_model = RandomForestClassifier(random_state=10)
random_forest_model.fit(X_train, y_train)

# fit() returns the estimator itself, so model is the same fitted object
model = random_forest_model

In [34]:
from sklearn import metrics

# Despite its name, predict_train_data holds the predictions for the
# held-out X_test rows.
predict_train_data = model.predict(X_test)

test_accuracy = metrics.accuracy_score(y_test, predict_train_data)
print("Accuracy = {0:.3f}".format(test_accuracy))

In [35]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted labels on the test split
cm = confusion_matrix(y_true=y_test, y_pred=predict_train_data)
cm

In [36]:
import joblib

# Persist the fitted model to a file in the working directory
joblib.dump(value=model, filename="./rf_kidney.joblib")