# Tahmine Dayalı Değer Atama Yöntemleri

## 1. KNN (K-En Yakın Komşu) Algoritması

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
from ycimpute.imputer import EM
from ycimpute.imputer import knnimput
from ycimpute.imputer import iterforest
# %pip install ycimpute==0.1.1

In [2]:
# Load the Titanic dataset through seaborn and keep only the
# float64 / int64 columns in a single chained expression.
df = sns.load_dataset("titanic").select_dtypes(include=["float64", "int64"])
df.head(5)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [3]:
# Number of missing values in each column.
df.isna().sum()

survived      0
pclass        0
age         177
sibsp         0
parch         0
fare          0
dtype: int64

* Öncelikle değişkenlerin isimlerini tutmamız gerekiyor.
* KNN bizden bir np.array bekliyor.
* DataFrame değişken adlarını bir yerde saklayıp daha sonra veri setini np.array yapısına dönüştüreceğiz.

In [4]:
# Keep the column labels so they can be re-attached after imputation,
# then copy the frame's values into a NumPy array.
var_names = df.columns.tolist()
n_df = df.to_numpy(copy=True)
# Preview the first ten rows of the array.
n_df[:10]

array([[ 0.    ,  3.    , 22.    ,  1.    ,  0.    ,  7.25  ],
       [ 1.    ,  1.    , 38.    ,  1.    ,  0.    , 71.2833],
       [ 1.    ,  3.    , 26.    ,  0.    ,  0.    ,  7.925 ],
       [ 1.    ,  1.    , 35.    ,  1.    ,  0.    , 53.1   ],
       [ 0.    ,  3.    , 35.    ,  0.    ,  0.    ,  8.05  ],
       [ 0.    ,  3.    ,     nan,  0.    ,  0.    ,  8.4583],
       [ 0.    ,  1.    , 54.    ,  0.    ,  0.    , 51.8625],
       [ 0.    ,  3.    ,  2.    ,  3.    ,  1.    , 21.075 ],
       [ 1.    ,  3.    , 27.    ,  0.    ,  2.    , 11.1333],
       [ 1.    ,  2.    , 14.    ,  1.    ,  0.    , 30.0708]])

In [5]:
# Dimensions of the array as a (rows, columns) tuple.
np.shape(n_df)

(891, 6)

* **k** parametresi komşu sayısını ifade ediyor.
* **complete** "tamamlamak", yani eksik değerleri doldurmak anlamına geliyor.
* Bu metodu **n_df**'teki boşlukları doldurmak için kullanacağız.

In [6]:
# Build a KNN imputer with k=4, then let it complete the array.
knn_imputer = knnimput.KNN(k=4)
df_2 = knn_imputer.complete(n_df)

Imputing row 1/891 with 0 missing, elapsed time: 0.094
Imputing row 101/891 with 0 missing, elapsed time: 0.095
Imputing row 201/891 with 0 missing, elapsed time: 0.096
Imputing row 301/891 with 1 missing, elapsed time: 0.097
Imputing row 401/891 with 0 missing, elapsed time: 0.097
Imputing row 501/891 with 0 missing, elapsed time: 0.098
Imputing row 601/891 with 0 missing, elapsed time: 0.099
Imputing row 701/891 with 0 missing, elapsed time: 0.100
Imputing row 801/891 with 0 missing, elapsed time: 0.101


In [7]:
# Inspect the type of the object returned by the imputer's complete() call.
type(df_2)

numpy.ndarray

* Sonuç bir numpy.ndarray olduğu için onu Pandas DataFrame yapısına dönüştürmemiz gerekiyor.

In [8]:
# Wrap the imputed values in a DataFrame, re-attaching the saved column names.
df_2 = pd.DataFrame(data=df_2, columns=var_names)
df_2.head(5)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0.0,3.0,22.0,1.0,0.0,7.25
1,1.0,1.0,38.0,1.0,0.0,71.2833
2,1.0,3.0,26.0,0.0,0.0,7.925
3,1.0,1.0,35.0,1.0,0.0,53.1
4,0.0,3.0,35.0,0.0,0.0,8.05


In [9]:
# Number of missing values left in each column after imputation.
df_2.isna().sum()

survived    0
pclass      0
age         0
sibsp       0
parch       0
fare        0
dtype: int64

## 2. Random Forest

In [10]:
# Reload the Titanic dataset so this section starts from the un-imputed data,
# keeping only the float64 / int64 columns.
df = sns.load_dataset("titanic").select_dtypes(include=["float64", "int64"])
df.head(5)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [11]:
# Number of missing values in each column.
df.isna().sum()

survived      0
pclass        0
age         177
sibsp         0
parch         0
fare          0
dtype: int64

* Değişken isimlerini topluyoruz.

In [12]:
# Column labels, saved for rebuilding the DataFrame after imputation.
var_names = df.columns.tolist()
var_names

['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare']

In [13]:
# Copy the frame's values into a NumPy array.
n_df = df.to_numpy(copy=True)
n_df

array([[ 0.    ,  3.    , 22.    ,  1.    ,  0.    ,  7.25  ],
       [ 1.    ,  1.    , 38.    ,  1.    ,  0.    , 71.2833],
       [ 1.    ,  3.    , 26.    ,  0.    ,  0.    ,  7.925 ],
       ...,
       [ 0.    ,  3.    ,     nan,  1.    ,  2.    , 23.45  ],
       [ 1.    ,  1.    , 26.    ,  0.    ,  0.    , 30.    ],
       [ 0.    ,  3.    , 32.    ,  0.    ,  0.    ,  7.75  ]])

In [14]:
# Complete the array with ycimpute's IterImput (default settings) and
# wrap the result in a DataFrame carrying the saved column names.
forest_imputer = iterforest.IterImput()
df_2 = pd.DataFrame(forest_imputer.complete(n_df), columns=var_names)
df_2.head(5)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0.0,3.0,22.0,1.0,0.0,7.25
1,1.0,1.0,38.0,1.0,0.0,71.2833
2,1.0,3.0,26.0,0.0,0.0,7.925
3,1.0,1.0,35.0,1.0,0.0,53.1
4,0.0,3.0,35.0,0.0,0.0,8.05


In [15]:
# Number of missing values left in each column after imputation.
df_2.isna().sum()

survived    0
pclass      0
age         0
sibsp       0
parch       0
fare        0
dtype: int64

## 3. EM Algoritması

In [16]:
# Reload the Titanic dataset so this section starts from the un-imputed data,
# keeping only the float64 / int64 columns.
df = sns.load_dataset("titanic").select_dtypes(include=["float64", "int64"])
df.head(5)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [17]:
# Number of missing values in each column.
df.isna().sum()

survived      0
pclass        0
age         177
sibsp         0
parch         0
fare          0
dtype: int64

In [18]:
# Column labels, saved for rebuilding the DataFrame after imputation.
var_names = df.columns.tolist()
var_names

['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare']

In [19]:
# Copy the frame's values into a NumPy array.
n_df = df.to_numpy(copy=True)
n_df

array([[ 0.    ,  3.    , 22.    ,  1.    ,  0.    ,  7.25  ],
       [ 1.    ,  1.    , 38.    ,  1.    ,  0.    , 71.2833],
       [ 1.    ,  3.    , 26.    ,  0.    ,  0.    ,  7.925 ],
       ...,
       [ 0.    ,  3.    ,     nan,  1.    ,  2.    , 23.45  ],
       [ 1.    ,  1.    , 26.    ,  0.    ,  0.    , 30.    ],
       [ 0.    ,  3.    , 32.    ,  0.    ,  0.    ,  7.75  ]])

In [20]:
# Complete the array with ycimpute's EM imputer (default settings) and
# wrap the result in a DataFrame carrying the saved column names.
em_imputer = EM()
df_2 = pd.DataFrame(em_imputer.complete(n_df), columns=var_names)
df_2.head(5)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0.0,3.0,22.0,1.0,0.0,7.25
1,1.0,1.0,38.0,1.0,0.0,71.2833
2,1.0,3.0,26.0,0.0,0.0,7.925
3,1.0,1.0,35.0,1.0,0.0,53.1
4,0.0,3.0,35.0,0.0,0.0,8.05


In [21]:
# Number of missing values left in each column after imputation.
df_2.isna().sum()

survived    0
pclass      0
age         0
sibsp       0
parch       0
fare        0
dtype: int64