### Değer Atama Yöntemleri

In [1]:
import numpy as np
import pandas as pd
# Three numeric columns with deliberately missing entries (NaN), used as the
# running example for the imputation cells that follow.
# np.nan is used instead of np.NaN: the np.NaN alias was removed in NumPy 2.0.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V2 = np.array([7, np.nan, 5, 8, 12, np.nan, np.nan, 2, 3])
V3 = np.array([np.nan, 12, 5, 6, 14, 7, np.nan, 2, 31])
df = pd.DataFrame(
    {
        "V1": V1,
        "V2": V2,
        "V3": V3,
    }
)
df

Unnamed: 0,V1,V2,V3
0,1.0,7.0,
1,3.0,,12.0
2,6.0,5.0,5.0
3,,8.0,6.0
4,7.0,12.0,14.0
5,1.0,,7.0
6,,,
7,9.0,2.0,2.0
8,15.0,3.0,31.0


### Sayısal Değişkenlerde Atama

In [2]:
# Replace the missing values of V1 with the mean of V1 (returns a new Series).
v1 = df["V1"]
v1.fillna(v1.mean())

0     1.0
1     3.0
2     6.0
3     6.0
4     7.0
5     1.0
6     6.0
7     9.0
8    15.0
Name: V1, dtype: float64

In [4]:
# Each column is passed to the lambda in turn (axis 0),
# so every variable is filled with its own mean.
df.apply(lambda col: col.fillna(col.mean()), axis="index")

Unnamed: 0,V1,V2,V3
0,1.0,7.0,11.0
1,3.0,6.166667,12.0
2,6.0,5.0,5.0
3,6.0,8.0,6.0
4,7.0,12.0,14.0
5,1.0,6.166667,7.0
6,6.0,6.166667,11.0
7,9.0,2.0,2.0
8,15.0,3.0,31.0


In [5]:
# Second way: pass the Series of column means directly to fillna.
column_means = df.mean()
df.fillna(column_means)

Unnamed: 0,V1,V2,V3
0,1.0,7.0,11.0
1,3.0,6.166667,12.0
2,6.0,5.0,5.0
3,6.0,8.0,6.0
4,7.0,12.0,14.0
5,1.0,6.166667,7.0
6,6.0,6.166667,11.0
7,9.0,2.0,2.0
8,15.0,3.0,31.0


In [6]:
# Fills only V1 and V2 with their means; columns outside the label slice
# are left untouched.
df.fillna(df.mean().loc["V1":"V2"])

Unnamed: 0,V1,V2,V3
0,1.0,7.0,
1,3.0,6.166667,12.0
2,6.0,5.0,5.0
3,6.0,8.0,6.0
4,7.0,12.0,14.0
5,1.0,6.166667,7.0
6,6.0,6.166667,
7,9.0,2.0,2.0
8,15.0,3.0,31.0


In [7]:
# Fills the missing values of V3 with the median of V3.
v3 = df["V3"]
v3.fillna(v3.median())

0     7.0
1    12.0
2     5.0
3     6.0
4    14.0
5     7.0
6     7.0
7     2.0
8    31.0
Name: V3, dtype: float64

In [8]:
# Third way: keep the cells that are present and take the column mean
# wherever a cell is missing.
df.where(df.notna(), other=df.mean(), axis="columns")

Unnamed: 0,V1,V2,V3
0,1.0,7.0,11.0
1,3.0,6.166667,12.0
2,6.0,5.0,5.0
3,6.0,8.0,6.0
4,7.0,12.0,14.0
5,1.0,6.166667,7.0
6,6.0,6.166667,11.0
7,9.0,2.0,2.0
8,15.0,3.0,31.0


### Kategorik Değişken Kırılımında Değer Atama

In [9]:
import numpy as np
import pandas as pd
# Example frame with a numeric column ("maas") that has missing values and a
# categorical column ("departman") used to group the imputation.
# np.nan is used instead of np.NaN: the np.NaN alias was removed in NumPy 2.0.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V2 = np.array([7, np.nan, 5, 8, 12, np.nan, np.nan, 2, 3])
V3 = np.array([np.nan, 12, 5, 6, 14, 7, np.nan, 2, 31])
V4 = np.array(["IT", "IT", "IK", "IK", "IK", "IK", "IK", "IT", "IT"])
df = pd.DataFrame(
    {
        "maas": V1,
        "V2": V2,
        "V3": V3,
        "departman": V4,
    }
)
df

Unnamed: 0,maas,V2,V3,departman
0,1.0,7.0,,IT
1,3.0,,12.0,IT
2,6.0,5.0,5.0,IK
3,,8.0,6.0,IK
4,7.0,12.0,14.0,IK
5,1.0,,7.0,IK
6,,,,IK
7,9.0,2.0,2.0,IT
8,15.0,3.0,31.0,IT


In [10]:
# Mean of "maas" within each "departman" group.
maas_by_departman = df.groupby("departman")["maas"]
maas_by_departman.mean()

departman
IK    4.666667
IT    7.000000
Name: maas, dtype: float64

In [11]:
# transform("mean") yields a Series aligned with df's rows that holds each
# row's group mean, so a missing "maas" is filled with its own department's mean.
departman_mean_maas = df.groupby("departman")["maas"].transform("mean")
df["maas"].fillna(departman_mean_maas)

0     1.000000
1     3.000000
2     6.000000
3     4.666667
4     7.000000
5     1.000000
6     4.666667
7     9.000000
8    15.000000
Name: maas, dtype: float64

### Kategorik Değişkenler için Eksik Değer Atama

In [12]:
import numpy as np
import pandas as pd
# Example frame whose categorical column ("departman") has a missing entry.
# dtype=object keeps the NaN as a real missing value instead of letting NumPy
# coerce it to the string "nan".
# np.nan is used instead of np.NaN: the np.NaN alias was removed in NumPy 2.0.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V4 = np.array(["IT", np.nan, "IK", "IK", "IK", "IK", "IK", "IT", "IT"], dtype=object)
df = pd.DataFrame(
    {
        "maas": V1,
        "departman": V4,
    }
)
df

Unnamed: 0,maas,departman
0,1.0,IT
1,3.0,
2,6.0,IK
3,,IK
4,7.0,IK
5,1.0,IK
6,,IK
7,9.0,IT
8,15.0,IT


In [13]:
# mode() returns a Series of the most frequent value(s); take the first one.
departman_modes = df["departman"].mode()
departman_modes.iloc[0]

'IK'

In [15]:
# Fill the missing category with the most frequent category.
most_common_departman = df["departman"].mode()[0]
df["departman"].fillna(most_common_departman)

0    IT
1    IK
2    IK
3    IK
4    IK
5    IK
6    IK
7    IT
8    IT
Name: departman, dtype: object

In [16]:
# Fills each missing value with the next valid value after it (backward fill).
# Series.bfill() is used because fillna(method="bfill") is deprecated in
# recent pandas releases.
df["departman"].bfill()

0    IT
1    IK
2    IK
3    IK
4    IK
5    IK
6    IK
7    IT
8    IT
Name: departman, dtype: object

In [17]:
# Fills each missing value with the last valid value before it (forward fill).
# Series.ffill() is used because fillna(method="ffill") is deprecated in
# recent pandas releases.
df["departman"].ffill()

0    IT
1    IT
2    IK
3    IK
4    IK
5    IK
6    IK
7    IT
8    IT
Name: departman, dtype: object

### Tahmine Dayalı Değer Atama Yöntemleri

In [18]:
import seaborn as sns
import missingno as msno
# Load the seaborn "titanic" data set and keep only the float64/int64 columns,
# then preview the first rows and count the missing values per column.
titanic = sns.load_dataset('titanic')
df = titanic.select_dtypes(include = ['float64','int64'])
print(df.head())
df.isnull().sum()

   survived  pclass   age  sibsp  parch     fare
0         0       3  22.0      1      0   7.2500
1         1       1  38.0      1      0  71.2833
2         1       3  26.0      0      0   7.9250
3         1       1  35.0      1      0  53.1000
4         0       3  35.0      0      0   8.0500


survived      0
pclass        0
age         177
sibsp         0
parch         0
fare          0
dtype: int64

In [19]:
!pip install ycimpute

Collecting ycimpute
  Downloading ycimpute-0.2-py3-none-any.whl (35 kB)
Collecting torch>=1.1.0
  Downloading torch-1.8.0-cp38-cp38-win_amd64.whl (190.5 MB)
Installing collected packages: torch, ycimpute
Successfully installed torch-1.8.0 ycimpute-0.2


In [20]:
from ycimpute.imputer import knnimput

In [21]:
# Remember the column labels so they can be re-attached after imputation.
var_names = df.columns.tolist()

In [22]:
# Independent NumPy copy of the frame's values, as input for the imputer.
n_df = df.to_numpy(copy=True)

In [24]:
# Preview the first ten rows of the array.
n_df[:10]

array([[ 0.    ,  3.    , 22.    ,  1.    ,  0.    ,  7.25  ],
       [ 1.    ,  1.    , 38.    ,  1.    ,  0.    , 71.2833],
       [ 1.    ,  3.    , 26.    ,  0.    ,  0.    ,  7.925 ],
       [ 1.    ,  1.    , 35.    ,  1.    ,  0.    , 53.1   ],
       [ 0.    ,  3.    , 35.    ,  0.    ,  0.    ,  8.05  ],
       [ 0.    ,  3.    ,     nan,  0.    ,  0.    ,  8.4583],
       [ 0.    ,  1.    , 54.    ,  0.    ,  0.    , 51.8625],
       [ 0.    ,  3.    ,  2.    ,  3.    ,  1.    , 21.075 ],
       [ 1.    ,  3.    , 27.    ,  0.    ,  2.    , 11.1333],
       [ 1.    ,  2.    , 14.    ,  1.    ,  0.    , 30.0708]])

In [25]:
# Dimensions of the array built from df.
n_df.shape

(891, 6)

In [27]:
# Fill the missing values with the KNN algorithm (k = 4).
knn_imputer = knnimput.KNN(k = 4)
dff = knn_imputer.complete(n_df)

Imputing row 1/891 with 0 missing, elapsed time: 1.703
Imputing row 101/891 with 0 missing, elapsed time: 1.704
Imputing row 201/891 with 0 missing, elapsed time: 1.705
Imputing row 301/891 with 1 missing, elapsed time: 1.706
Imputing row 401/891 with 0 missing, elapsed time: 1.707
Imputing row 501/891 with 0 missing, elapsed time: 1.708
Imputing row 601/891 with 0 missing, elapsed time: 1.709
Imputing row 701/891 with 0 missing, elapsed time: 1.710
Imputing row 801/891 with 0 missing, elapsed time: 1.711


In [28]:
# Inspect the type of the object returned by the imputer's complete() call.
type(dff)

numpy.ndarray

In [29]:
# Wrap the imputed values in a DataFrame and restore the saved column labels.
dff = pd.DataFrame(data=dff, columns=var_names)

In [30]:
# Check the type of dff again after wrapping it with pd.DataFrame.
type(dff)

pandas.core.frame.DataFrame

In [31]:
# Count the remaining missing values per column after imputation.
dff.isna().sum()

survived    0
pclass      0
age         0
sibsp       0
parch       0
fare        0
dtype: int64

In [None]:
# Filling the missing values with the EM algorithm

In [32]:
# Reload the seaborn "titanic" data set and keep only the float64/int64
# columns, then preview the first rows and count the missing values per column.
titanic = sns.load_dataset('titanic')
df = titanic.select_dtypes(include = ['float64','int64'])
print(df.head())
df.isnull().sum()

   survived  pclass   age  sibsp  parch     fare
0         0       3  22.0      1      0   7.2500
1         1       1  38.0      1      0  71.2833
2         1       3  26.0      0      0   7.9250
3         1       1  35.0      1      0  53.1000
4         0       3  35.0      0      0   8.0500


survived      0
pclass        0
age         177
sibsp         0
parch         0
fare          0
dtype: int64

In [33]:
# Remember the column labels so they can be re-attached after imputation.
var_names = df.columns.tolist()

In [38]:
# Independent NumPy copy of the frame's values, as input for the imputer.
n_df = df.to_numpy(copy=True)

In [40]:
from ycimpute.imputer import EM

In [41]:
# Fill the missing values of the array with ycimpute's EM imputer.
em_imputer = EM()
dff = em_imputer.complete(n_df)

In [42]:
# Wrap the imputed values in a DataFrame and restore the saved column labels.
dff = pd.DataFrame(data=dff, columns=var_names)

In [43]:
# Count the remaining missing values per column after imputation.
dff.isna().sum()

survived    0
pclass      0
age         0
sibsp       0
parch       0
fare        0
dtype: int64