### Simple Imputer

In [1]:
import numpy as np
from sklearn.impute import SimpleImputer

# Toy 4x3 matrix in which every column has exactly one missing entry (np.nan).
data = np.array(
    [
        [1, 2, np.nan],
        [3, np.nan, 4],
        [5, 6, 7],
        [np.nan, 8, 9],
    ]
)

# Display the raw array, NaNs included, before any imputation is applied.
data

array([[ 1.,  2., nan],
       [ 3., nan,  4.],
       [ 5.,  6.,  7.],
       [nan,  8.,  9.]])

In [2]:
# Mean strategy: each NaN is replaced by the mean of the observed values
# in its own column.
imputer = SimpleImputer(strategy="mean")
imputed_data = imputer.fit(data).transform(data)

print("\nImputed Data (with mean strategy):\n", imputed_data)


Imputed Data (with mean strategy):
 [[1.         2.         6.66666667]
 [3.         5.33333333 4.        ]
 [5.         6.         7.        ]
 [3.         8.         9.        ]]


In [3]:
# Sanity check: mean of the observed third-column values, (4 + 7 + 9) / 3.
20/3

6.666666666666667

In [4]:
import numpy as np
from sklearn.impute import SimpleImputer

# Same toy matrix plus a fifth row, so that 1 and 2 repeat in the first two columns.
data = np.array(
    [
        [1, 2, np.nan],
        [3, np.nan, 4],
        [5, 6, 7],
        [np.nan, 8, 9],
        [1, 2, 2],
    ]
)

# 'most_frequent' fills each NaN with the most common observed value of its
# column; when several values tie, the smallest one is used.
imputer_most_frequent = SimpleImputer(strategy="most_frequent")
imputed_data_most_frequent = imputer_most_frequent.fit(data).transform(data)

print("Imputed Data (most_frequent strategy):\n", imputed_data_most_frequent)

Imputed Data (most_frequent strategy):
 [[1. 2. 2.]
 [3. 2. 4.]
 [5. 6. 7.]
 [1. 8. 9.]
 [1. 2. 2.]]


'most_frequent': Useful for categorical data or when the most common value is a good representation of the missing data.

'constant': Useful when you want to replace missing values with a specific value (e.g., 0, -1, or any other placeholder).

In [5]:
import numpy as np
from sklearn.impute import SimpleImputer

# Five-row toy matrix with one NaN in each of the three columns.
data = np.array(
    [
        [1, 2, np.nan],
        [3, np.nan, 4],
        [5, 6, 7],
        [np.nan, 8, 9],
        [1, 2, 2],
    ]
)

# 'constant' ignores the column contents and writes fill_value (here 0)
# into every missing position.
imputer_constant = SimpleImputer(strategy="constant", fill_value=0)
imputed_data_constant = imputer_constant.fit(data).transform(data)

print("Imputed Data (constant strategy):\n", imputed_data_constant)

Imputed Data (constant strategy):
 [[1. 2. 0.]
 [3. 0. 4.]
 [5. 6. 7.]
 [0. 8. 9.]
 [1. 2. 2.]]


### KNN Imputer

In [6]:
import numpy as np
from sklearn.impute import KNNImputer

In [7]:
# 5x3 toy matrix for the KNN example: rows 0, 3 and 4 each miss one value,
# while rows 1 and 2 are complete.
data = np.array(
    [
        [1, 2, np.nan],
        [3, 4, 3],
        [7, 6, 8],
        [np.nan, 5, 9],
        [2, np.nan, 5],
    ]
)

In [8]:
# Estimate every NaN from the two nearest rows (n_neighbors=2).
imputer = KNNImputer(n_neighbors=2)
imputed_data = imputer.fit(data).transform(data)

In [9]:
# Show the input and the imputed result one after the other for comparison.
print("Original Data with Missing Values:", data, sep="\n")

print("\nImputed Data:", imputed_data, sep="\n")

Original Data with Missing Values:
[[ 1.  2. nan]
 [ 3.  4.  3.]
 [ 7.  6.  8.]
 [nan  5.  9.]
 [ 2. nan  5.]]

Imputed Data:
[[1. 2. 4.]
 [3. 4. 3.]
 [7. 6. 8.]
 [4. 5. 9.]
 [2. 3. 5.]]


The n_neighbors parameter specifies the number of nearest neighbors to use for imputation.

Each missing value (np.nan) is replaced with the mean of that feature over the n_neighbors nearest rows that have the feature observed (neighbors are weighted equally by default).

---

We can customize the KNNImputer by adjusting parameters such as:

    n_neighbors: Number of neighbors to use.

    weights: Weight function used in prediction (e.g., uniform, distance).

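    metric: Distance metric used to find neighbors. KNNImputer accepts 'nan_euclidean' (the default, which skips missing coordinates) or a custom callable; generic names such as euclidean or manhattan are not valid here.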

---

In [10]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Small product table: two categorical columns containing NaNs and a
# complete numeric 'Price' column.
data = dict(
    Color=['Red', 'Blue', np.nan, 'Green', 'Blue', 'Red', np.nan, 'Blue'],
    Size=['S', 'M', 'L', np.nan, 'M', 'M', 'S', np.nan],
    Price=[10, 15, 20, 25, 30, 35, 40, 45],
)
df = pd.DataFrame(data)
print(df)

   Color Size  Price
0    Red    S     10
1   Blue    M     15
2    NaN    L     20
3  Green  NaN     25
4   Blue    M     30
5    Red    M     35
6    NaN    S     40
7   Blue  NaN     45


In [11]:
# Fill missing categories with each column's most frequent value (its mode).
imputer = SimpleImputer(strategy='most_frequent')

categorical_cols = ['Color', 'Size']

# Impute into a copy so that `df` keeps its NaNs: writing the result back into
# `df` would leave nothing for the later pandas fillna example to fill, and the
# mode counts quoted for the original data would no longer match `df`.
df_imputed = df.copy()
df_imputed[categorical_cols] = imputer.fit_transform(df[categorical_cols])

print(df_imputed)

   Color Size  Price
0    Red    S     10
1   Blue    M     15
2   Blue    L     20
3  Green    M     25
4   Blue    M     30
5    Red    M     35
6   Blue    S     40
7   Blue    M     45


### Categorical data imputation with pandas (mode)

In [12]:
# mode() returns a Series of the most frequent value(s); take its first entry.
df['Color'].mode().iloc[0]

'Blue'

In [13]:
# Same imputation with plain pandas: fill each column's gaps with its own mode.
for col in ('Color', 'Size'):
    most_common = df[col].mode()[0]
    df[col] = df[col].fillna(most_common)

print(df)

   Color Size  Price
0    Red    S     10
1   Blue    M     15
2   Blue    L     20
3  Green    M     25
4   Blue    M     30
5    Red    M     35
6   Blue    S     40
7   Blue    M     45



In the original data (before any imputation), the mode of 'Color' is 'Blue' (appears 3 times).

In the original data, the mode of 'Size' is 'M' (appears 3 times).

    All NaN values in these columns will be replaced with their respective modes.

Multiple modes: If several values tie for most frequent, mode() returns all of them in sorted order, so mode()[0] takes the first one (the smallest, or alphabetically first).

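Entire dataset: SimpleImputer applies a single strategy to every column it is given, so select the relevant columns first (as with categorical_cols above); 'mean' and 'median' only work on numeric columns.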