In [41]:
import pandas as pd

# Toy dataset: one list per column, rows aligned by position.
# The last Age entry is None, i.e. a missing value.
data = dict(
    Name=["bilo", "dilip", "bunni", "anisaa", "remo", "aravind"],
    Age=[21, 21, 22, 22, 23, None],
    Gender=["m", "m", "m", "f", "m", "m"],
    Job=["ai engineer", "web developer", "ai engineer", "manager", "manager", "actor"],
)

In [42]:
# Build the working DataFrame: each key of `data` becomes a column.
df = pd.DataFrame(data=data)

In [43]:
# Show the frame built from `data`.
df

Unnamed: 0,Name,Age,Gender,Job
0,bilo,21.0,m,ai engineer
1,dilip,21.0,m,web developer
2,bunni,22.0,m,ai engineer
3,anisaa,22.0,f,manager
4,remo,23.0,m,manager
5,aravind,,m,actor


In [44]:
# Preprocessing pipeline:
#   1. drop the Name feature
#   2. impute missing ages
#   3. encode gender numerically
#   4. one-hot encode jobs


In [45]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Drop the Name feature.
# `errors="ignore"` keeps this cell re-runnable: once Name has been removed,
# running the cell again is a no-op instead of raising KeyError.
df = df.drop(columns=["Name"], errors="ignore")

df

Unnamed: 0,Age,Gender,Job
0,21.0,m,ai engineer
1,21.0,m,web developer
2,22.0,m,ai engineer
3,22.0,f,manager
4,23.0,m,manager
5,,m,actor


In [46]:
# Impute ages: every missing Age is replaced by the mean of the observed ages.
imputer = SimpleImputer(strategy="mean")

# fit_transform returns a 2-D array with a single column here; flatten it
# before writing it back into the Age column.
df["Age"] = imputer.fit_transform(df[["Age"]]).ravel()

df

Unnamed: 0,Age,Gender,Job
0,21.0,m,ai engineer
1,21.0,m,web developer
2,22.0,m,ai engineer
3,22.0,f,manager
4,23.0,m,manager
5,21.8,m,actor


In [47]:
# Encode Gender as an integer: "m" -> 0, "f" -> 1.
# The lookup is strict: any value that is not a key of gender_dct raises
# KeyError (this includes re-running the cell once the column holds 0/1).
gender_dct = {"m": 0, "f": 1}
df["Gender"] = df["Gender"].map(lambda g: gender_dct[g])

df

Unnamed: 0,Age,Gender,Job
0,21.0,0,ai engineer
1,21.0,0,web developer
2,22.0,0,ai engineer
3,22.0,1,manager
4,23.0,0,manager
5,21.8,0,actor


In [55]:
# One-hot encode the Job column.
encoder = OneHotEncoder()
# fit_transform returns a sparse matrix by default; densify it so the
# individual columns can be copied into `df` later.
matrix = encoder.fit_transform(df[["Job"]]).toarray()

# Read the column labels from the fitted encoder instead of typing them by
# hand: `categories_[0]` lists the Job categories in the same order as the
# columns of `matrix`, so labels and values cannot drift apart when the data
# changes.
column_names = encoder.categories_[0].tolist()


In [56]:

# Dense one-hot array produced from df[["Job"]]: one row per record.
matrix

array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.]])

In [57]:
# Transposed view: each row now holds the values of one one-hot column.
matrix.T

array([[0., 0., 0., 0., 0., 1.],
       [1., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 1., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [58]:
# Number of rows in the transpose, i.e. the number of one-hot columns in `matrix`.
len(matrix.T)

4

In [59]:
# Values of the first one-hot column across all records.
matrix.T[0]

array([0., 0., 0., 0., 0., 1.])

In [66]:
# Attach each one-hot column to `df` under its category label.
# Guard first: a label list that does not match the number of one-hot columns
# would otherwise mislabel columns or fail part-way through the loop.
assert len(column_names) == matrix.shape[1], "column_names must label every one-hot column"
for name, values in zip(column_names, matrix.T):
  df[name] = values

# The original Job column is now redundant. `errors="ignore"` keeps the cell
# re-runnable after Job has already been removed.
df = df.drop(columns=["Job"], errors="ignore")


In [67]:

# Show the fully preprocessed frame.
# NOTE(review): the saved output of this cell lists the job columns in a
# different order than `column_names`; that suggests leftover kernel state from
# an earlier run. Re-run the notebook from a fresh kernel to confirm.
df

Unnamed: 0,Age,Gender,ai engineer,web developer,manager,actor
0,21.0,0,1.0,0.0,0.0,0.0
1,21.0,0,0.0,1.0,0.0,0.0
2,22.0,0,1.0,0.0,0.0,0.0
3,22.0,1,0.0,0.0,1.0,0.0
4,23.0,0,0.0,0.0,1.0,0.0
5,21.8,0,0.0,0.0,0.0,1.0


---------------------------------------------------------------------------


In [69]:
from sklearn.base import BaseEstimator, TransformerMixin

class nameDropper(BaseEstimator,TransformerMixin):
    """Transformer that removes the ``Name`` column from a DataFrame."""

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, x):
        """Return a new frame equal to ``x`` without its ``Name`` column.

        ``x`` itself is not modified.
        """
        return x.drop(columns=["Name"])



In [70]:
# Second toy dataset with the same four columns as `data`.
# The last Age entry is None, i.e. a missing value.
data_2 = dict(
    Name=["gopi", "jabi", "buni", "ani", "remoo", "ak"],
    Age=[27, 21, 25, 22, 23, None],
    Gender=["m", "m", "m", "f", "f", "m"],
    Job=["ai engineer", "web developer", "ai engineer", "web developer", "manager", "actor"],
)

In [71]:
# Build the second DataFrame: each key of `data_2` becomes a column.
df_2 = pd.DataFrame(data=data_2)

In [72]:
# Show the frame built from `data_2`.
df_2

Unnamed: 0,Name,Age,Gender,Job
0,gopi,27.0,m,ai engineer
1,jabi,21.0,m,web developer
2,buni,25.0,m,ai engineer
3,ani,22.0,f,web developer
4,remoo,23.0,f,manager
5,ak,,m,actor


In [85]:
class ageImputer(BaseEstimator,TransformerMixin):
  """Fill missing ``Age`` values with the mean age.

  The mean is learned in ``fit`` and reused in ``transform``, so a frame that
  is transformed later (for example a test set) is filled with the mean seen
  during fitting rather than with its own mean.
  """

  def fit(self,x,y=None):
    """Learn the mean of ``x["Age"]``; ``y`` is ignored. Returns ``self``."""
    self.imputer_ = SimpleImputer(strategy="mean").fit(x[["Age"]])
    return self

  def transform(self,x):
    """Return a copy of ``x`` whose missing ``Age`` values are filled in.

    ``x`` itself is left unmodified.
    """
    imputer = getattr(self, "imputer_", None)
    if imputer is None:
      # Backward compatibility: earlier versions of this class allowed
      # transform() without a prior fit() and learned the mean from the
      # frame being transformed. Keep that working.
      imputer = SimpleImputer(strategy="mean").fit(x[["Age"]])

    out = x.copy()
    # transform() returns a 2-D array with a single column; flatten it before
    # writing it back into the Age column.
    out["Age"] = imputer.transform(out[["Age"]]).ravel()
    return out

class featureEncoder(BaseEstimator,TransformerMixin):
  """Encode ``Gender`` as 0/1 and one-hot encode ``Job``.

  The set of job categories is learned in ``fit``. ``transform`` adds one
  indicator column per learned category, named after the category value, and
  removes the original ``Job`` column.
  """

  def fit(self,x,y=None):
    """Learn the job categories present in ``x["Job"]``; ``y`` is ignored."""
    self.encoder_ = OneHotEncoder().fit(x[["Job"]])
    return self

  def transform(self,x):
    """Return an encoded copy of ``x``; ``x`` itself is left unmodified.

    Raises ``KeyError`` if ``Gender`` contains a value other than "m" or "f".
    """
    gender_dct = {"m":0,"f":1}

    encoder = getattr(self, "encoder_", None)
    if encoder is None:
      # Backward compatibility: earlier versions of this class allowed
      # transform() without a prior fit() and learned the categories from
      # the frame being transformed. Keep that working.
      encoder = OneHotEncoder().fit(x[["Job"]])

    out = x.copy()
    out["Gender"] = [gender_dct[g] for g in out["Gender"]]

    # The encoder returns a sparse matrix by default; densify it so each
    # column can be assigned to the frame.
    matrix = encoder.transform(out[["Job"]]).toarray()

    # Column labels come from the fitted encoder, whose categories are listed
    # in the same order as the columns of `matrix`. A hand-typed list would
    # mislabel or break as soon as the set of jobs changes.
    column_names = encoder.categories_[0].tolist()
    for name, values in zip(column_names, matrix.T):
      out[name] = values

    return out.drop(columns=["Job"])






In [86]:

# Instantiate each custom transformer once.
dropper = nameDropper()
imp = ageImputer()
enc = featureEncoder()

# Chain them by hand in order: drop Name -> impute Age -> encode features.
# Each step receives the frame returned by the previous one.
df_2_manual = df_2
for step in (dropper, imp, enc):
    df_2_manual = step.fit_transform(df_2_manual)

df_2_manual

Unnamed: 0,Age,Gender,actor,ai engineer,manager,web developer
0,27.0,0,0.0,1.0,0.0,0.0
1,21.0,0,0.0,0.0,0.0,1.0
2,25.0,0,0.0,1.0,0.0,0.0
3,22.0,1,0.0,0.0,0.0,1.0
4,23.0,1,0.0,0.0,1.0,0.0
5,23.6,0,1.0,0.0,0.0,0.0


In [84]:
# ---------------- Same preprocessing, assembled as a scikit-learn Pipeline ----------------

In [88]:
from sklearn.pipeline import Pipeline

# Wire the three custom transformers into one Pipeline; the steps run in the
# order listed, each one receiving the previous step's output.
pipe = Pipeline(steps=[
    ("dropper", nameDropper()),
    ("imputer", ageImputer()),
    ("encoder", featureEncoder()),
])

# Fit every step on df_2 and return the fully transformed frame.
pipe.fit_transform(df_2)

Unnamed: 0,Age,Gender,actor,ai engineer,manager,web developer
0,27.0,0,0.0,1.0,0.0,0.0
1,21.0,0,0.0,0.0,0.0,1.0
2,25.0,0,0.0,1.0,0.0,0.0
3,22.0,1,0.0,0.0,0.0,1.0
4,23.0,1,0.0,0.0,1.0,0.0
5,23.6,0,1.0,0.0,0.0,0.0


In [89]:
# Display the pipeline object.
pipe