# Exercises on Imputer / Transformer

These exercises focus on creating and testing custom imputer classes in Python using NumPy and Pandas. The goal is to handle missing data in datasets by implementing different strategies for imputing missing values.

## Task a: Write a `MyImputer` Class

Create a custom imputer class that handles missing values represented as 0. The class should support different imputation strategies based on the provided parameter.

In [5]:
import numpy as np
import pandas as pd

class MyImputer:
    def __init__(self, strategy="mean"):
        self.strategy = strategy
        self.mean_values = {}
    
    def fit(self, X):
        if self.strategy == "mean":
            # Calculate mean for each column, ignoring zeros
            self.mean_values = X.replace(0, np.nan).mean()
        elif self.strategy == "feature":
            # No fitting required for 'feature' strategy
            pass
        else:
            raise ValueError("Unsupported strategy")
    
    def transform(self, X):
        X_transformed = X.copy()
        if self.strategy == "mean":
            # Replace zeros with the mean of the column
            for column in X_transformed.columns:
                X_transformed[column] = X_transformed[column].replace(0, self.mean_values[column])
        elif self.strategy == "feature":
            # Replace zeros with the column number (starting from 1)
            for idx, column in enumerate(X_transformed.columns, start=1):
                X_transformed[column] = X_transformed[column].replace(0, idx)
        else:
            raise ValueError("Unsupported strategy")
        return X_transformed


## Task c: Write a Test for `MyImputer`

Test the MyImputer class by creating a sample dataset, introducing missing values (represented as 0), and verifying that the imputer correctly replaces these missing values based on the chosen strategy.

In [6]:
# Sample DataFrame with missing values represented as 0
data = {
    'Age': [25, 0, 35, 40, 0],
    'Salary': [50000, 60000, 0, 80000, 90000]
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

Original DataFrame:
   Age  Salary
0   25   50000
1    0   60000
2   35       0
3   40   80000
4    0   90000


In [7]:
# Initialize the imputer with 'mean' strategy
imputer_mean = MyImputer(strategy="mean")

# Fit the imputer to the data
imputer_mean.fit(df)

# Transform the data
df_imputed_mean = imputer_mean.transform(df)
print("\nDataFrame after Mean Imputation:")
print(df_imputed_mean)



DataFrame after Mean Imputation:
         Age  Salary
0  25.000000   50000
1  33.333333   60000
2  35.000000   70000
3  40.000000   80000
4  33.333333   90000


In [8]:
# Initialize the imputer with 'feature' strategy
imputer_feature = MyImputer(strategy="feature")

# Fit the imputer to the data
imputer_feature.fit(df)

# Transform the data
df_imputed_feature = imputer_feature.transform(df)
print("\nDataFrame after Feature Imputation:")
print(df_imputed_feature)



DataFrame after Feature Imputation:
   Age  Salary
0   25   50000
1    1   60000
2   35       2
3   40   80000
4    1   90000


# Level 2

## Task a: MyImputer with -999 as Missing Value Indicator

Modify the MyImputer class to recognize -999 as the indicator for missing values instead of 0.

In [9]:
class MyImputer:
    def __init__(self, strategy="mean", missing_values=0):
        self.strategy = strategy
        self.missing_values = missing_values
        self.mean_values = {}
    
    def fit(self, X):
        if self.strategy == "mean":
            # Calculate mean for each column, ignoring missing_values
            X_replaced = X.replace(self.missing_values, np.nan)
            self.mean_values = X_replaced.mean()
        elif self.strategy == "feature":
            # No fitting required for 'feature' strategy
            pass
        else:
            raise ValueError("Unsupported strategy")
    
    def transform(self, X):
        X_transformed = X.copy()
        if self.strategy == "mean":
            # Replace missing_values with the mean of the column
            for column in X_transformed.columns:
                X_transformed[column] = X_transformed[column].replace(self.missing_values, self.mean_values[column])
        elif self.strategy == "feature":
            # Replace missing_values with the column number (starting from 1)
            for idx, column in enumerate(X_transformed.columns, start=1):
                X_transformed[column] = X_transformed[column].replace(self.missing_values, idx)
        else:
            raise ValueError("Unsupported strategy")
        return X_transformed


In [10]:
# Sample DataFrame with missing values represented as -999
data = {
    'Age': [25, -999, 35, 40, -999],
    'Salary': [50000, 60000, -999, 80000, 90000]
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Initialize the imputer with 'mean' strategy and missing_values=-999
imputer_mean = MyImputer(strategy="mean", missing_values=-999)

# Fit and transform the data
imputer_mean.fit(df)
df_imputed_mean = imputer_mean.transform(df)
print("\nDataFrame after Mean Imputation:")
print(df_imputed_mean)


Original DataFrame:
   Age  Salary
0   25   50000
1 -999   60000
2   35    -999
3   40   80000
4 -999   90000

DataFrame after Mean Imputation:
         Age  Salary
0  25.000000   50000
1  33.333333   60000
2  35.000000   70000
3  40.000000   80000
4  33.333333   90000


## Task b: MyImputer with np.nan as Missing Value Indicator

Further modify the MyImputer class to recognize np.nan as the indicator for missing values.

In [11]:
# Sample DataFrame with missing values as np.nan
data = {
    'Age': [25, np.nan, 35, 40, np.nan],
    'Salary': [50000, 60000, np.nan, 80000, 90000]
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Initialize the imputer with 'mean' strategy and missing_values=np.nan
imputer_mean = MyImputer(strategy="mean", missing_values=np.nan)

# Fit and transform the data
imputer_mean.fit(df)
df_imputed_mean = imputer_mean.transform(df)
print("\nDataFrame after Mean Imputation:")
print(df_imputed_mean)


Original DataFrame:
    Age   Salary
0  25.0  50000.0
1   NaN  60000.0
2  35.0      NaN
3  40.0  80000.0
4   NaN  90000.0

DataFrame after Mean Imputation:
         Age   Salary
0  25.000000  50000.0
1  33.333333  60000.0
2  35.000000  70000.0
3  40.000000  80000.0
4  33.333333  90000.0


## Task c: MyImputer with Multiple Missing Value Indicators

Enhance the MyImputer class to handle a list of different missing value indicators. Each missing value should be replaced using the specified strategy.

In [12]:
class MyImputer:
    def __init__(self, strategy="mean", missing_values=[0]):
        self.strategy = strategy
        self.missing_values = missing_values
        self.mean_values = {}
    
    def fit(self, X):
        if self.strategy == "mean":
            # Replace all missing_values with np.nan
            X_replaced = X.replace(self.missing_values, np.nan)
            self.mean_values = X_replaced.mean()
        elif self.strategy == "feature":
            # No fitting required for 'feature' strategy
            pass
        else:
            raise ValueError("Unsupported strategy")
    
    def transform(self, X):
        X_transformed = X.copy()
        if self.strategy == "mean":
            # Replace all missing_values with the mean of the column
            for column in X_transformed.columns:
                X_transformed[column] = X_transformed[column].replace(self.missing_values, self.mean_values[column])
        elif self.strategy == "feature":
            # Replace all missing_values with the column number (starting from 1)
            for idx, column in enumerate(X_transformed.columns, start=1):
                X_transformed[column] = X_transformed[column].replace(self.missing_values, idx)
        else:
            raise ValueError("Unsupported strategy")
        return X_transformed


In [13]:
# Sample DataFrame with multiple missing value indicators
data = {
    'Age': [25, -999, 35, 40, 'NA'],
    'Salary': [50000, 60000, -999, 80000, 90000]
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Initialize the imputer with 'mean' strategy and multiple missing_values
imputer_mean = MyImputer(strategy="mean", missing_values=[0, -999, 'NA'])

# Fit and transform the data
imputer_mean.fit(df)
df_imputed_mean = imputer_mean.transform(df)
print("\nDataFrame after Mean Imputation:")
print(df_imputed_mean)


Original DataFrame:
    Age  Salary
0    25   50000
1  -999   60000
2    35    -999
3    40   80000
4    NA   90000

DataFrame after Mean Imputation:
         Age  Salary
0  25.000000   50000
1  33.333333   60000
2  35.000000   70000
3  40.000000   80000
4  33.333333   90000


  X_replaced = X.replace(self.missing_values, np.nan)
  X_transformed[column] = X_transformed[column].replace(self.missing_values, self.mean_values[column])


## Level 2d: Extend MyImputer with "feature" Strategy

Modify the MyImputer class to handle the "feature" strategy, which replaces missing values with the respective column number.

In [14]:
# Sample DataFrame with multiple missing value indicators
data = {
    'Age': [25, -999, 35, 40, 'NA'],
    'Salary': [50000, 60000, -999, 80000, 90000]
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Initialize the imputer with 'feature' strategy and multiple missing_values
imputer_feature = MyImputer(strategy="feature", missing_values=[0, -999, 'NA'])

# Fit and transform the data
imputer_feature.fit(df)
df_imputed_feature = imputer_feature.transform(df)
print("\nDataFrame after Feature Imputation:")
print(df_imputed_feature)


Original DataFrame:
    Age  Salary
0    25   50000
1  -999   60000
2    35    -999
3    40   80000
4    NA   90000

DataFrame after Feature Imputation:
   Age  Salary
0   25   50000
1    1   60000
2   35       2
3   40   80000
4    1   90000


  X_transformed[column] = X_transformed[column].replace(self.missing_values, idx)


## Level 3

## Write a GroupImputer Class

Create a custom imputer class that handles missing values by grouping the data based on a categorical target variable and imputing missing values with the mean of each group.

Example Scenario: Imagine a dataset containing information about fruits with some missing values:

Index	Feature1	Feature2	Category
1	10	Apfel	3
2	12	Apfel	2
3	??	Apfel	
4	??	Birne	50
5	55	Birne	60
6	56	Birne	
Missing values are represented by ??. The GroupImputer will replace these missing values with the mean of their respective categories (Apfel or Birne).

In [15]:
import numpy as np
import pandas as pd


class GroupImputer:
    def __init__(self, target_column, missing_values=[0, -999, 'NA', '??']):
        self.target_column = target_column
        self.missing_values = missing_values
        self.group_means = {}
    
    def fit(self, X):
        # Replace missing values with np.nan
        X_replaced = X.replace(self.missing_values, np.nan)
        # Calculate mean for each group
        self.group_means = X_replaced.groupby(self.target_column).mean()
    
    def transform(self, X):
        X_transformed = X.copy()
        for category in X_transformed[self.target_column].unique():
            # Get mean values for the category
            means = self.group_means.loc[category]
            # Replace missing values with group means
            X_transformed.loc[X_transformed[self.target_column] == category] = X_transformed.loc[X_transformed[self.target_column] == category].replace(self.missing_values, means)
        return X_transformed



In [16]:
# Sample DataFrame with missing values represented as '??'
data = {
    'Feature1': [10, 12, '??', '??', 55, 56],
    'Category': ['Apfel', 'Apfel', 'Apfel', 'Birne', 'Birne', 'Birne']
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Convert 'Feature1' to numeric, coercing errors to NaN
df['Feature1'] = pd.to_numeric(df['Feature1'], errors='coerce')

# Initialize the GroupImputer
group_imputer = GroupImputer(target_column='Category', missing_values=['??'])

# Fit the imputer to the data
group_imputer.fit(df)

# Transform the data
df_imputed = group_imputer.transform(df)
print("\nDataFrame after Group Imputation:")
print(df_imputed)


Original DataFrame:
  Feature1 Category
0       10    Apfel
1       12    Apfel
2       ??    Apfel
3       ??    Birne
4       55    Birne
5       56    Birne

DataFrame after Group Imputation:
   Feature1 Category
0      10.0    Apfel
1      12.0    Apfel
2       NaN    Apfel
3       NaN    Birne
4      55.0    Birne
5      56.0    Birne
