In [45]:
import pandas as pd

# Toy dataset: five people, four features. One Age is None (becomes NaN)
# so that the imputation step has something to fill.
data = {
    "Name": ["Anna", "Bob", "Charlie", "Diana", "Eric"],
    "Age": [20, 34, 23, None, 33],
    "Gender": ["f", "m", "m", "f", "m"],
    "Job": ["Programmer", "Writer", "Cook", "Programmer", "Teacher"],
}

# Column-oriented dict -> DataFrame (one column per key).
df = pd.DataFrame.from_dict(data)

In [4]:
# Show the raw frame; note the missing Age for Diana (row 3).
df

Unnamed: 0,Name,Age,Gender,Job
0,Anna,20.0,f,Programmer
1,Bob,34.0,m,Writer
2,Charlie,23.0,m,Cook
3,Diana,,f,Programmer
4,Eric,33.0,m,Teacher


Preprocessing Pipeline:

* Drop the Name feature
* Impute missing Age values with the column mean
* Encode Gender as a binary numeric feature (m → 0, f → 1)
* One-hot encode Job (dropping the first category)

# Without Pipeline

In [42]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Work on a new frame instead of rebinding `df`: the "With Pipeline" cells
# call pipe.fit_transform(df) and need the raw columns (Name, Job) intact,
# otherwise Restart & Run All fails with a KeyError on "Name".

# Drop Name Feature
df_manual = df.drop(columns=["Name"])

# Impute ages (missing values are replaced by the column mean)
imputer = SimpleImputer(strategy="mean")
df_manual["Age"] = imputer.fit_transform(df_manual[["Age"]])

# Numeric Gender
gender_dct = {"m": 0, "f": 1}
df_manual["Gender"] = [gender_dct[g] for g in df_manual["Gender"]]

# Create an instance of the OneHotEncoder
encoder = OneHotEncoder(drop="first")
# Fit and transform the "Job" column using the encoder
job_encoded = encoder.fit_transform(df_manual[["Job"]]).toarray()
# Get the feature names for the encoded columns
encoded_feature_names = encoder.get_feature_names_out(["Job"])
# Create a DataFrame with the encoded job columns. Reuse the source index so
# the axis=1 concat below aligns row-for-row even if the index is not 0..n-1.
job_encoded_df = pd.DataFrame(
    job_encoded, columns=encoded_feature_names, index=df_manual.index
)
# Drop the original "Job" column and append the encoded columns
df_encoded = pd.concat([df_manual.drop(columns=["Job"]), job_encoded_df], axis=1)

# With Pipeline

In [46]:
from sklearn.base import BaseEstimator, TransformerMixin

class NameDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the ``Name`` column."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame equal to ``X`` without the ``Name`` column."""
        return X.drop(columns=["Name"])
    
class AgeImputer(BaseEstimator, TransformerMixin):
    """Fill missing ``Age`` values with the mean age seen during ``fit``.

    The mean is learned once in ``fit`` and reused in ``transform``, so data
    transformed later (e.g. a test split) is imputed with the training mean
    rather than with its own mean.
    """

    def fit(self, X, y=None):
        """Learn the mean of ``X["Age"]``; ``y`` is ignored. Returns ``self``."""
        self.imputer_ = SimpleImputer(strategy="mean")
        self.imputer_.fit(X[["Age"]])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing ``Age`` values imputed.

        ``fit`` must have been called first; otherwise ``self.imputer_`` does
        not exist and an ``AttributeError`` is raised.
        """
        # Copy so the caller's frame is not modified in place.
        X = X.copy()
        X["Age"] = self.imputer_.transform(X[["Age"]])
        return X
    
class FeatureEncoder(BaseEstimator, TransformerMixin):
    """Encode ``Gender`` as 0/1 and one-hot encode ``Job``.

    The one-hot encoder is fitted in ``fit`` and reused in ``transform`` so
    every transformed frame gets the same ``Job_*`` columns as the fit data.
    """

    def fit(self, X, y=None):
        """Learn the ``Job`` categories from ``X``; ``y`` is ignored."""
        self.encoder_ = OneHotEncoder(drop="first")
        self.encoder_.fit(X[["Job"]])
        return self

    def transform(self, X):
        """Return a new frame with numeric ``Gender`` and one-hot ``Job`` columns.

        Raises ``KeyError`` if ``Gender`` holds a value other than "m" or "f".
        ``fit`` must have been called first; otherwise ``self.encoder_`` does
        not exist and an ``AttributeError`` is raised.
        """
        # Copy so the caller's frame is not modified in place.
        X = X.copy()

        # Numeric Gender
        gender_dct = {"m": 0, "f": 1}
        X["Gender"] = [gender_dct[g] for g in X["Gender"]]

        # Encode "Job" with the categories learned in fit
        job_encoded = self.encoder_.transform(X[["Job"]]).toarray()
        encoded_feature_names = self.encoder_.get_feature_names_out(["Job"])
        # Reuse X's index so the axis=1 concat aligns row-for-row even when
        # the index is not 0..n-1 (e.g. after filtering or a train/test split).
        job_encoded_df = pd.DataFrame(
            job_encoded, columns=encoded_feature_names, index=X.index
        )

        # Replace the original "Job" column with the encoded columns
        return pd.concat([X.drop(columns=["Job"]), job_encoded_df], axis=1)

In [47]:
from sklearn.pipeline import Pipeline

# Chain the three custom transformers; each step receives the previous
# step's output: drop Name -> impute Age -> encode Gender and Job.
pipe = Pipeline(
    steps=[
        ("dropper", NameDropper()),
        ("imputer", AgeImputer()),
        ("encoder", FeatureEncoder()),
    ]
)
pipe.fit_transform(df)

Unnamed: 0,Age,Gender,Job_Programmer,Job_Teacher,Job_Writer
0,20.0,1,1.0,0.0,0.0
1,34.0,0,0.0,0.0,1.0
2,23.0,0,0.0,0.0,0.0
3,27.5,1,1.0,0.0,0.0
4,33.0,0,0.0,1.0,0.0
