### Simple Imputer

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer

# Toy 4x3 matrix with exactly one missing entry (NaN) in every column.
data = np.array(
    [
        [1, 2, np.nan],
        [3, np.nan, 4],
        [5, 6, 7],
        [np.nan, 8, 9],
    ]
)

# Echo the raw array as the cell output so the NaN positions are visible
# before any imputation is applied.
data

In [None]:
# Mean strategy: learn the per-column mean of the observed values, then use it
# to fill the NaN entries of that column.
imputer = SimpleImputer(strategy='mean')
imputer.fit(data)
imputed_data = imputer.transform(data)

print("\nImputed Data (with mean strategy):\n", imputed_data)

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer

# Same toy matrix as before plus a fifth row [1, 2, 2]; the extra row makes
# the value 1 (column 0) and the value 2 (column 1) occur twice.
data = np.array(
    [
        [1, 2, np.nan],
        [3, np.nan, 4],
        [5, 6, 7],
        [np.nan, 8, 9],
        [1, 2, 2],
    ]
)

# 'most_frequent' strategy: every NaN is filled with the most common observed
# value of its own column.
imputer_most_frequent = SimpleImputer(strategy='most_frequent')
imputer_most_frequent.fit(data)
imputed_data_most_frequent = imputer_most_frequent.transform(data)

print("Imputed Data (most_frequent strategy):\n", imputed_data_most_frequent)

'most_frequent': Useful for categorical data or when the most common value is a good representation of the missing data.

'constant': Useful when you want to replace missing values with a specific value (e.g., 0, -1, or any other placeholder).

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer

# 5x3 toy matrix with three missing entries (one per column) to be filled
# with a fixed placeholder value.
data = np.array(
    [
        [1, 2, np.nan],
        [3, np.nan, 4],
        [5, 6, 7],
        [np.nan, 8, 9],
        [1, 2, 2],
    ]
)

# 'constant' strategy: every NaN is replaced by the same placeholder,
# here fill_value=10.
imputer_constant = SimpleImputer(strategy='constant', fill_value=10)
imputer_constant.fit(data)
imputed_data_constant = imputer_constant.transform(data)

print("Imputed Data (constant strategy):\n", imputed_data_constant)

### KNN Imputer

In [None]:
import numpy as np
from sklearn.impute import KNNImputer

In [None]:
# 5x3 toy matrix for KNN imputation; rows 0, 3 and 4 each miss one feature,
# rows 1 and 2 are fully observed.
data = np.array(
    [
        [1, 2, np.nan],
        [3, 4, 3],
        [7, 6, 8],
        [np.nan, 5, 9],
        [2, np.nan, 5],
    ]
)

In [None]:
# Estimate each missing entry from its 2 nearest rows.
imputer = KNNImputer(n_neighbors=2)
imputer.fit(data)
imputed_data = imputer.transform(data)

In [None]:
# Show the input and the imputed result one after the other for comparison;
# sep="\n" puts each array on the line after its heading.
print("Original Data with Missing Values:", data, sep="\n")

print("\nImputed Data:", imputed_data, sep="\n")

The n_neighbors parameter specifies the number of nearest neighbors to use for imputation.

Each missing value (np.nan) is replaced with the mean of that feature's values taken from the n_neighbors nearest rows that have the feature observed (with the default weights='uniform').

---

We can customize the KNNImputer by adjusting parameters such as:

    n_neighbors: Number of neighbors to use.

    weights: Weight function used in prediction (e.g., uniform, distance).

    metric: Distance metric used to find neighbors. KNNImputer accepts only 'nan_euclidean' (the default, a Euclidean distance that ignores missing coordinates) or a custom callable.

---

### Example

In [4]:

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Small product table: two categorical columns and one numeric column,
# with np.nan marking the missing entries in each of them.
data = dict(
    Color=['Red', 'Blue', np.nan, 'Green', 'Blue', 'Red', np.nan, 'Blue'],
    Size=['S', 'M', 'L', np.nan, 'M', 'M', 'S', np.nan],
    Price=[10, 15, 20, 25, 30, np.nan, 40, 45],
)
df = pd.DataFrame(data)
print(df)


   Color Size  Price
0    Red    S   10.0
1   Blue    M   15.0
2    NaN    L   20.0
3  Green  NaN   25.0
4   Blue    M   30.0
5    Red    M    NaN
6    NaN    S   40.0
7   Blue  NaN   45.0


In [5]:

# Categorical columns: fill each gap with that column's most frequent label.
cat_imputer = SimpleImputer(strategy='most_frequent')

# Name the column subset once so the selection and the write-back cannot drift apart.
categorical_cols = ['Color', 'Size']
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])
print(df)

   Color Size  Price
0    Red    S   10.0
1   Blue    M   15.0
2   Blue    L   20.0
3  Green    M   25.0
4   Blue    M   30.0
5    Red    M    NaN
6   Blue    S   40.0
7   Blue    M   45.0


In [None]:

# Numeric column: fill the missing Price with the median of the observed prices.
num_imputer = SimpleImputer(strategy='median')

# Double brackets keep the selection 2-D, which is the shape the imputer works on.
numeric_cols = ['Price']
df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])

print(df)