In [115]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.impute import KNNImputer


In [116]:
df = pd.read_csv("kidney_disease.csv")
df

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [117]:
features = df.columns.tolist()
features.remove('classification') 

Aparentemente temos muitas variáveis categóricas. Irei atrás do repositório oficial do dataset para compreender estas variáveis:

1.Age(numerical) <br>
- age in years<br>

2.Blood Pressure(numerical)<br>
- bp in mm/Hg<br>

3.Specific Gravity(nominal)(Urina)<br>  
- sg - (1.005,1.010,1.015,1.020,1.025)<br>

4.Albumin(nominal)<br>
- al - (0,1,2,3,4,5)<br>

5.Sugar(nominal)<br>
- su - (0,1,2,3,4,5)<br>

6.Red Blood Cells(nominal)<br>
- rbc - (normal,abnormal)<br>

7.Pus Cell (nominal)<br>
- pc - (normal,abnormal)<br>

8.Pus Cell clumps(nominal)<br>
- pcc - (present,notpresent)<br>

9.Bacteria(nominal)<br>
- ba  - (present,notpresent)<br>

10.Blood Glucose Random(numerical)		<br>
- bgr in mgs/dl<br>

11.Blood Urea(numerical)	<br>
- bu in mgs/dl<br>

12.Serum Creatinine(numerical)	<br>
- sc in mgs/dl<br>

13.Sodium(numerical)<br>
- sod in mEq/L<br>

14.Potassium(numerical)	<br>
- pot in mEq/L<br>

15.Hemoglobin(numerical)<br>
- hemo in gms<br>

16.Packed  Cell Volume(numerical)<br>

17.White Blood Cell Count(numerical)<br>
- wc in cells/cumm<br>

18.Red Blood Cell Count(numerical)	<br>
- rc in millions/cmm<br>

19.Hypertension(nominal)	<br>
- htn - (yes,no)<br>

20.Diabetes Mellitus(nominal)	<br>
- dm - (yes,no)<br>

21.Coronary Artery Disease(nominal)<br>
- cad - (yes,no)<br>

22.Appetite(nominal)	<br>
- appet - (good,poor)<br>

23.Pedal Edema(nominal)<br>
- pe - (yes,no)	<br>

24.Anemia(nominal)<br>
- ane - (yes,no)<br>

25.Class (nominal)		<br>
- class - (ckd,notckd)<br>

---

As minhas variáveis de interesse são: `age, anemia, wc, rc, hemoglobin, bu, bgr, ba, pcc, bp, al, su, rbc, sc, pot, sod`

In [118]:

dfSemNa = df[features].dropna()

In [119]:
itens_f = []
for feature in features:
    sub = df[feature].isna().sum()
    print(f"Feature: {feature} | sub: {sub}")
    
    

Feature: id | sub: 0
Feature: age | sub: 9
Feature: bp | sub: 12
Feature: sg | sub: 47
Feature: al | sub: 46
Feature: su | sub: 49
Feature: rbc | sub: 152
Feature: pc | sub: 65
Feature: pcc | sub: 4
Feature: ba | sub: 4
Feature: bgr | sub: 44
Feature: bu | sub: 19
Feature: sc | sub: 17
Feature: sod | sub: 87
Feature: pot | sub: 88
Feature: hemo | sub: 52
Feature: pcv | sub: 70
Feature: wc | sub: 105
Feature: rc | sub: 130
Feature: htn | sub: 2
Feature: dm | sub: 2
Feature: cad | sub: 2
Feature: appet | sub: 1
Feature: pe | sub: 1
Feature: ane | sub: 1


In [None]:
dffillmode = df['Native Country'].fillna(df['Native Country'].mode(), inplace=True)

In [120]:
dfDropNaRBC = df
dfDropNaRBC.dropna(subset = ['rbc'], inplace=np.median(df['']))
dfDropNaRBC = dfDropNaRBC.drop("id", axis=1)
dfDropNaRBC

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd
7,24.0,,1.015,2.0,4.0,normal,abnormal,notpresent,notpresent,410.0,...,44,6900,5,no,yes,no,good,yes,no,ckd
8,52.0,100.0,1.015,3.0,0.0,normal,abnormal,present,notpresent,138.0,...,33,9600,4.0,yes,yes,no,good,no,yes,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,140.0,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,75.0,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,100.0,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,114.0,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [143]:
ohe = OneHotEncoder()

featuresCateg = ohe.fit_transform(df[["ane",'pc', "ba", "pcc",'htn','dm','cad','appet','classification']]).toarray()

In [144]:
featuresCateg

array([[0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.]])

In [None]:
colunasLabels2 = ohe.categories_
colunasLabels2 = np.array(colunasLabels2).ravel()

In [145]:
X = [features]
imputer = KNNImputer(n_neighbors=2)
imputer.fit_transform(X)

ValueError: could not convert string to float: 'id'

In [122]:
X = dfSemNa[features]

In [123]:
X = X.dropna().reset_index()

In [124]:
X

Unnamed: 0,index,id,age,bp,sg,al,su,rbc,pc,pcc,...,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane
0,3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,...,11.2,32,6700,3.9,yes,no,no,poor,yes,yes
1,9,9,53.0,90.0,1.020,2.0,0.0,abnormal,abnormal,present,...,9.5,29,12100,3.7,yes,yes,no,poor,no,yes
2,11,11,63.0,70.0,1.010,3.0,0.0,abnormal,abnormal,present,...,10.8,32,4500,3.8,yes,yes,no,poor,yes,no
3,14,14,68.0,80.0,1.010,3.0,2.0,normal,abnormal,present,...,5.6,16,11000,2.6,yes,yes,yes,poor,yes,no
4,20,20,61.0,80.0,1.015,2.0,0.0,abnormal,abnormal,notpresent,...,7.7,24,9200,3.2,yes,yes,yes,poor,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,...,15.7,47,6700,4.9,no,no,no,good,no,no
154,396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,...,16.5,54,7800,6.2,no,no,no,good,no,no
155,397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,...,15.8,49,6600,5.4,no,no,no,good,no,no
156,398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,...,14.2,51,7200,5.9,no,no,no,good,no,no


Metade das minhas rows se foram embora. Acredito que uma tentativa de KNN Input seria o ideal. Deixarei para perguntar se posso usar na próxima vez.

In [125]:
ohe = OneHotEncoder()

featuresCateg = ohe.fit_transform(X[["ane", "ba", "pcc"]]).toarray()

In [126]:
colunasLabels = ohe.categories_
colunasLabels = np.array(colunasLabels).ravel()

In [127]:
colunasLabels 

array(['no', 'yes', 'notpresent', 'present', 'notpresent', 'present'],
      dtype=object)

In [128]:
colunasArrumadas = pd.DataFrame(featuresCateg, columns=["N_Ane", "Y_Ane", "NP_Ba", "P_Ba", "NP_Pcc", "P_Pcc"])

In [129]:
colunasArrumadas

Unnamed: 0,N_Ane,Y_Ane,NP_Ba,P_Ba,NP_Pcc,P_Pcc
0,0.0,1.0,1.0,0.0,0.0,1.0
1,0.0,1.0,1.0,0.0,0.0,1.0
2,1.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,1.0
4,0.0,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...
153,1.0,0.0,1.0,0.0,1.0,0.0
154,1.0,0.0,1.0,0.0,1.0,0.0
155,1.0,0.0,1.0,0.0,1.0,0.0
156,1.0,0.0,1.0,0.0,1.0,0.0


In [130]:
X_new = pd.concat([X, colunasArrumadas], axis=1)
X_new = X_new.drop(columns=["index", "ane", "ba", "pcc"])
X_new

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,bgr,bu,...,dm,cad,appet,pe,N_Ane,Y_Ane,NP_Ba,P_Ba,NP_Pcc,P_Pcc
0,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,117.0,56.0,...,no,no,poor,yes,0.0,1.0,1.0,0.0,0.0,1.0
1,9,53.0,90.0,1.020,2.0,0.0,abnormal,abnormal,70.0,107.0,...,yes,no,poor,no,0.0,1.0,1.0,0.0,0.0,1.0
2,11,63.0,70.0,1.010,3.0,0.0,abnormal,abnormal,380.0,60.0,...,yes,no,poor,yes,1.0,0.0,1.0,0.0,0.0,1.0
3,14,68.0,80.0,1.010,3.0,2.0,normal,abnormal,157.0,90.0,...,yes,yes,poor,yes,1.0,0.0,0.0,1.0,0.0,1.0
4,20,61.0,80.0,1.015,2.0,0.0,abnormal,abnormal,173.0,148.0,...,yes,yes,poor,yes,0.0,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,395,55.0,80.0,1.020,0.0,0.0,normal,normal,140.0,49.0,...,no,no,good,no,1.0,0.0,1.0,0.0,1.0,0.0
154,396,42.0,70.0,1.025,0.0,0.0,normal,normal,75.0,31.0,...,no,no,good,no,1.0,0.0,1.0,0.0,1.0,0.0
155,397,12.0,80.0,1.020,0.0,0.0,normal,normal,100.0,26.0,...,no,no,good,no,1.0,0.0,1.0,0.0,1.0,0.0
156,398,17.0,60.0,1.025,0.0,0.0,normal,normal,114.0,50.0,...,no,no,good,no,1.0,0.0,1.0,0.0,1.0,0.0


In [131]:
Y = dfSemNa[["classification"]]
Y

KeyError: "None of [Index(['classification'], dtype='object')] are in the [columns]"

In [None]:
ohe2 = OneHotEncoder()

targetCateg = ohe.fit_transform(Y[["classification"]]).toarray()

In [None]:
colunasLabels2 = ohe.categories_
colunasLabels2 = np.array(colunasLabels2).ravel()

In [None]:
colunasLabels2

array(['ckd', 'notckd'], dtype=object)

In [None]:
targetArrumado = pd.DataFrame(targetCateg, columns=colunasLabels2)

In [None]:
targetArrumado

Unnamed: 0,ckd,notckd
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
196,0.0,1.0
197,0.0,1.0
198,0.0,1.0
199,0.0,1.0
